Diffstat
198 files changed, 44906 insertions, 5735 deletions
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 452787152748..6d9381d60172 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,27 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only -/s390x/memop -/s390x/resets -/s390x/sync_regs_test -/x86_64/cr4_cpuid_sync_test -/x86_64/debug_regs -/x86_64/evmcs_test -/x86_64/hyperv_cpuid -/x86_64/mmio_warning_test -/x86_64/platform_info_test -/x86_64/set_sregs_test -/x86_64/smm_test -/x86_64/state_test -/x86_64/vmx_preemption_timer_test -/x86_64/svm_vmcall_test -/x86_64/sync_regs_test -/x86_64/vmx_close_while_nested_test -/x86_64/vmx_dirty_log_test -/x86_64/vmx_set_nested_state_test -/x86_64/vmx_tsc_adjust_test -/x86_64/xss_msr_test -/clear_dirty_log_test -/demand_paging_test -/dirty_log_test -/kvm_create_max_vcpus -/set_memory_region_test -/steal_time +* +!/**/ +!*.c +!*.h +!*.S +!*.sh diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4a166588d99f..ce8ff8e8ce3a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -1,84 +1,224 @@ # SPDX-License-Identifier: GPL-2.0-only -include ../../../../scripts/Kbuild.include +include ../../../build/Build.include all: top_srcdir = ../../../.. -KSFT_KHDR_INSTALL := 1 - -# For cross-builds to work, UNAME_M has to map to ARCH and arch specific -# directories and targets in this Makefile. "uname -m" doesn't map to -# arch specific sub-directory names. -# -# UNAME_M variable to used to run the compiles pointing to the right arch -# directories and build the right targets for these supported architectures. -# -# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable. -# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable. -# -# x86_64 targets are named to include x86_64 as a suffix and directories -# for includes are in x86_64 sub-directory. s390x and aarch64 follow the -# same convention. "uname -m" doesn't result in the correct mapping for -# s390x and aarch64. 
-# -# No change necessary for x86_64 -UNAME_M := $(shell uname -m) - -# Set UNAME_M for arm64 compile/install to work -ifeq ($(ARCH),arm64) - UNAME_M := aarch64 -endif -# Set UNAME_M s390x compile/install to work -ifeq ($(ARCH),s390) - UNAME_M := s390x +include $(top_srcdir)/scripts/subarch.include +ARCH ?= $(SUBARCH) + +ifeq ($(ARCH),x86) + ARCH_DIR := x86_64 +else ifeq ($(ARCH),arm64) + ARCH_DIR := aarch64 +else ifeq ($(ARCH),s390) + ARCH_DIR := s390x +else + ARCH_DIR := $(ARCH) endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c -LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c -LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c +LIBKVM += lib/assert.c +LIBKVM += lib/elf.c +LIBKVM += lib/guest_modes.c +LIBKVM += lib/io.c +LIBKVM += lib/kvm_util.c +LIBKVM += lib/memstress.c +LIBKVM += lib/guest_sprintf.c +LIBKVM += lib/rbtree.c +LIBKVM += lib/sparsebit.c +LIBKVM += lib/test_util.c +LIBKVM += lib/ucall_common.c +LIBKVM += lib/userfaultfd_util.c + +LIBKVM_STRING += lib/string_override.c + +LIBKVM_x86_64 += lib/x86_64/apic.c +LIBKVM_x86_64 += lib/x86_64/handlers.S +LIBKVM_x86_64 += lib/x86_64/hyperv.c +LIBKVM_x86_64 += lib/x86_64/memstress.c +LIBKVM_x86_64 += lib/x86_64/pmu.c +LIBKVM_x86_64 += lib/x86_64/processor.c +LIBKVM_x86_64 += lib/x86_64/sev.c +LIBKVM_x86_64 += lib/x86_64/svm.c +LIBKVM_x86_64 += lib/x86_64/ucall.c +LIBKVM_x86_64 += lib/x86_64/vmx.c + +LIBKVM_aarch64 += lib/aarch64/gic.c +LIBKVM_aarch64 += lib/aarch64/gic_v3.c +LIBKVM_aarch64 += lib/aarch64/gic_v3_its.c +LIBKVM_aarch64 += lib/aarch64/handlers.S +LIBKVM_aarch64 += lib/aarch64/processor.c +LIBKVM_aarch64 += lib/aarch64/spinlock.c +LIBKVM_aarch64 += lib/aarch64/ucall.c +LIBKVM_aarch64 += lib/aarch64/vgic.c + +LIBKVM_s390x += lib/s390x/diag318_test_handler.c +LIBKVM_s390x += lib/s390x/processor.c +LIBKVM_s390x += lib/s390x/ucall.c -TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test -TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test +LIBKVM_riscv += lib/riscv/handlers.S +LIBKVM_riscv += lib/riscv/processor.c +LIBKVM_riscv += lib/riscv/ucall.c + +# Non-compiled test targets +TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh + +# Compiled test targets +TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test +TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features +TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test +TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test +TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid -TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_extended_hypercalls +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_ipi +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_svm_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_tlb_flush +TEST_GEN_PROGS_x86_64 += x86_64/kvm_clock_test +TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test +TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test +TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_counters_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test 
+TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test +TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test +TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_shutdown_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_nested_soft_inject_test +TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test +TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test +TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test +TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state +TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test +TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test +TEST_GEN_PROGS_x86_64 += x86_64/xapic_state_test +TEST_GEN_PROGS_x86_64 += x86_64/xcr0_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs -TEST_GEN_PROGS_x86_64 += clear_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_caps_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test +TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_init2_tests +TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests +TEST_GEN_PROGS_x86_64 += x86_64/sev_smoke_test +TEST_GEN_PROGS_x86_64 += x86_64/amx_test +TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test +TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test +TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test +TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test +TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += guest_memfd_test +TEST_GEN_PROGS_x86_64 += guest_print_test +TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus +TEST_GEN_PROGS_x86_64 += kvm_page_table_test +TEST_GEN_PROGS_x86_64 += max_guest_memory_test +TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test +TEST_GEN_PROGS_x86_64 += memslot_perf_test +TEST_GEN_PROGS_x86_64 += rseq_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time +TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test +TEST_GEN_PROGS_x86_64 += system_counter_offset_test + +# Compiled outputs used by test targets +TEST_GEN_PROGS_EXTENDED_x86_64 += x86_64/nx_huge_pages_test -TEST_GEN_PROGS_aarch64 += clear_dirty_log_test +TEST_GEN_PROGS_aarch64 += aarch64/aarch32_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions +TEST_GEN_PROGS_aarch64 += aarch64/hypercalls +TEST_GEN_PROGS_aarch64 += aarch64/page_fault_test +TEST_GEN_PROGS_aarch64 += aarch64/psci_test +TEST_GEN_PROGS_aarch64 += aarch64/set_id_regs +TEST_GEN_PROGS_aarch64 += aarch64/smccc_filter +TEST_GEN_PROGS_aarch64 += aarch64/vcpu_width_config +TEST_GEN_PROGS_aarch64 += 
aarch64/vgic_init +TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq +TEST_GEN_PROGS_aarch64 += aarch64/vgic_lpi_stress +TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access +TEST_GEN_PROGS_aarch64 += access_tracking_perf_test +TEST_GEN_PROGS_aarch64 += arch_timer TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test +TEST_GEN_PROGS_aarch64 += dirty_log_perf_test +TEST_GEN_PROGS_aarch64 += guest_print_test +TEST_GEN_PROGS_aarch64 += get-reg-list TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus +TEST_GEN_PROGS_aarch64 += kvm_page_table_test +TEST_GEN_PROGS_aarch64 += memslot_modification_stress_test +TEST_GEN_PROGS_aarch64 += memslot_perf_test +TEST_GEN_PROGS_aarch64 += rseq_test TEST_GEN_PROGS_aarch64 += set_memory_region_test TEST_GEN_PROGS_aarch64 += steal_time +TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += s390x/tprot +TEST_GEN_PROGS_s390x += s390x/cmma_test +TEST_GEN_PROGS_s390x += s390x/debug_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test +TEST_GEN_PROGS_s390x += guest_print_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus +TEST_GEN_PROGS_s390x += kvm_page_table_test +TEST_GEN_PROGS_s390x += rseq_test TEST_GEN_PROGS_s390x += set_memory_region_test +TEST_GEN_PROGS_s390x += kvm_binary_stats_test + +TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test +TEST_GEN_PROGS_riscv += riscv/ebreak_test +TEST_GEN_PROGS_riscv += arch_timer +TEST_GEN_PROGS_riscv += demand_paging_test +TEST_GEN_PROGS_riscv += dirty_log_test +TEST_GEN_PROGS_riscv += get-reg-list +TEST_GEN_PROGS_riscv += guest_print_test +TEST_GEN_PROGS_riscv += kvm_binary_stats_test +TEST_GEN_PROGS_riscv += kvm_create_max_vcpus +TEST_GEN_PROGS_riscv += kvm_page_table_test +TEST_GEN_PROGS_riscv += set_memory_region_test +TEST_GEN_PROGS_riscv += steal_time + +SPLIT_TESTS += arch_timer +SPLIT_TESTS += get-reg-list -TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) -LIBKVM += $(LIBKVM_$(UNAME_M)) +TEST_PROGS += $(TEST_PROGS_$(ARCH_DIR)) +TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR)) +TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR)) +LIBKVM += $(LIBKVM_$(ARCH_DIR)) + +OVERRIDE_TARGETS = 1 + +# lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most +# importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`, +# which causes the environment variable to override the makefile). +include ../lib.mk INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ @@ -89,38 +229,96 @@ else LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ + -Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \ + -D_GNU_SOURCE -fno-builtin-memcmp -fno-builtin-memcpy \ + -fno-builtin-memset -fno-builtin-strnlen \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ - -I$(<D) -Iinclude/$(UNAME_M) -I.. + -I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. 
$(EXTRA_CFLAGS) \ + $(KHDR_INCLUDES) +ifeq ($(ARCH),s390) + CFLAGS += -march=z10 +endif +ifeq ($(ARCH),arm64) +tools_dir := $(top_srcdir)/tools +arm64_tools_dir := $(tools_dir)/arch/arm64/tools/ + +ifneq ($(abs_objdir),) +arm64_hdr_outdir := $(abs_objdir)/tools/ +else +arm64_hdr_outdir := $(tools_dir)/ +endif -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie) +GEN_HDRS := $(arm64_hdr_outdir)arch/arm64/include/generated/ +CFLAGS += -I$(GEN_HDRS) + +$(GEN_HDRS): $(wildcard $(arm64_tools_dir)/*) + $(MAKE) -C $(arm64_tools_dir) OUTPUT=$(arm64_hdr_outdir) +endif + +no-pie-option := $(call try-run, echo 'int main(void) { return 0; }' | \ + $(CC) -Werror $(CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) # On s390, build the testcases KVM-enabled -pgste-option = $(call try-run, echo 'int main() { return 0; }' | \ +pgste-option = $(call try-run, echo 'int main(void) { return 0; }' | \ $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste) - +LDLIBS += -ldl LDFLAGS += -pthread $(no-pie-option) $(pgste-option) -# After inclusion, $(OUTPUT) is defined and -# $(TEST_GEN_PROGS) starts with $(OUTPUT)/ -include ../lib.mk +LIBKVM_C := $(filter %.c,$(LIBKVM)) +LIBKVM_S := $(filter %.S,$(LIBKVM)) +LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) +LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) +LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) +SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) +SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH_DIR)/%.o, $(SPLIT_TESTS)) + +TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS)) +TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) +TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(SPLIT_TEST_GEN_OBJ)) +-include $(TEST_DEP_FILES) + +$(shell mkdir -p $(sort $(OUTPUT)/$(ARCH_DIR) $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) + +$(filter-out $(SPLIT_TEST_GEN_PROGS), $(TEST_GEN_PROGS)) \ +$(TEST_GEN_PROGS_EXTENDED): %: %.o + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@ +$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -STATIC_LIBS := $(OUTPUT)/libkvm.a -LIBKVM_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM)) -EXTRA_CLEAN += $(LIBKVM_OBJ) $(STATIC_LIBS) cscope.* +$(SPLIT_TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/$(ARCH_DIR)/%.o + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $^ $(LDLIBS) -o $@ +$(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH_DIR)/%.o: $(ARCH_DIR)/%.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(GEN_HDRS) \ + $(LIBKVM_OBJS) \ + $(SPLIT_TEST_GEN_OBJ) \ + $(TEST_DEP_FILES) \ + $(TEST_GEN_OBJ) \ + cscope.* + +$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS) + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -x := $(shell mkdir -p $(sort $(dir $(LIBKVM_OBJ)))) -$(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c +$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(GEN_HDRS) $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -$(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) - $(AR) crs $@ $^ +# Compile the string overrides as freestanding to prevent the compiler from +# generating self-referential code, e.g. without "freestanding" the compiler may +# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion. 
+$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@ -x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) -all: $(STATIC_LIBS) -$(TEST_GEN_PROGS): $(STATIC_LIBS) +$(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) +$(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS) +$(TEST_GEN_PROGS): $(LIBKVM_OBJS) +$(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS) +$(TEST_GEN_OBJ): $(GEN_HDRS) cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. cscope: diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c new file mode 100644 index 000000000000..8e5bd07a3727 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * aarch32_id_regs - Test for ID register behavior on AArch64-only systems + * + * Copyright (c) 2022 Google LLC. + * + * Test that KVM handles the AArch64 views of the AArch32 ID registers as RAZ + * and WI from userspace. + */ + +#include <stdint.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include <linux/bitfield.h> + +#define BAD_ID_REG_VAL 0x1badc0deul + +#define GUEST_ASSERT_REG_RAZ(reg) GUEST_ASSERT_EQ(read_sysreg_s(reg), 0) + +static void guest_main(void) +{ + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_AFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR3_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR5_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR4_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_ISAR6_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR0_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_MVFR2_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 3)); + GUEST_ASSERT_REG_RAZ(SYS_ID_PFR2_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_DFR1_EL1); + GUEST_ASSERT_REG_RAZ(SYS_ID_MMFR5_EL1); + GUEST_ASSERT_REG_RAZ(sys_reg(3, 0, 0, 3, 7)); + + GUEST_DONE(); +} + +static void test_guest_raz(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +} + +static uint64_t raz_wi_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_PFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_DFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR0_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR1_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR3_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR5_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR4_EL1), + KVM_ARM64_SYS_REG(SYS_ID_ISAR6_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR0_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR1_EL1), + KVM_ARM64_SYS_REG(SYS_MVFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_PFR2_EL1), + KVM_ARM64_SYS_REG(SYS_ID_MMFR5_EL1), +}; + +static void test_user_raz_wi(struct kvm_vcpu *vcpu) +{ + int i; + + for (i = 0; i < 
ARRAY_SIZE(raz_wi_reg_ids); i++) { + uint64_t reg_id = raz_wi_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + TEST_ASSERT_EQ(val, 0); + + /* + * Expect the ioctl to succeed with no effect on the register + * value. + */ + vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + + vcpu_get_reg(vcpu, reg_id, &val); + TEST_ASSERT_EQ(val, 0); + } +} + +static uint64_t raz_invariant_reg_ids[] = { + KVM_ARM64_SYS_REG(SYS_ID_AFR0_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 3)), + KVM_ARM64_SYS_REG(SYS_ID_DFR1_EL1), + KVM_ARM64_SYS_REG(sys_reg(3, 0, 0, 3, 7)), +}; + +static void test_user_raz_invariant(struct kvm_vcpu *vcpu) +{ + int i, r; + + for (i = 0; i < ARRAY_SIZE(raz_invariant_reg_ids); i++) { + uint64_t reg_id = raz_invariant_reg_ids[i]; + uint64_t val; + + vcpu_get_reg(vcpu, reg_id, &val); + TEST_ASSERT_EQ(val, 0); + + r = __vcpu_set_reg(vcpu, reg_id, BAD_ID_REG_VAL); + TEST_ASSERT(r < 0 && errno == EINVAL, + "unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + vcpu_get_reg(vcpu, reg_id, &val); + TEST_ASSERT_EQ(val, 0); + } +} + + + +static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) +{ + uint64_t val, el0; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + + el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + return el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY; +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + TEST_REQUIRE(vcpu_aarch64_only(vcpu)); + + test_user_raz_wi(vcpu); + test_user_raz_invariant(vcpu); + test_guest_raz(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c new file mode 100644 index 000000000000..eeba1cc87ff8 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates both the virtual and physical timer IRQs using + * CVAL and TVAL registers. + * + * Copyright (c) 2021, Google LLC. 
+ */ +#include "arch_timer.h" +#include "delay.h" +#include "gic.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" +#include "vgic.h" + +enum guest_stage { + GUEST_STAGE_VTIMER_CVAL = 1, + GUEST_STAGE_VTIMER_TVAL, + GUEST_STAGE_PTIMER_CVAL, + GUEST_STAGE_PTIMER_TVAL, + GUEST_STAGE_MAX, +}; + +static int vtimer_irq, ptimer_irq; + +static void +guest_configure_timer_action(struct test_vcpu_shared_data *shared_data) +{ + switch (shared_data->guest_stage) { + case GUEST_STAGE_VTIMER_CVAL: + timer_set_next_cval_ms(VIRTUAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(VIRTUAL); + timer_set_ctl(VIRTUAL, CTL_ENABLE); + break; + case GUEST_STAGE_VTIMER_TVAL: + timer_set_next_tval_ms(VIRTUAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(VIRTUAL); + timer_set_ctl(VIRTUAL, CTL_ENABLE); + break; + case GUEST_STAGE_PTIMER_CVAL: + timer_set_next_cval_ms(PHYSICAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(PHYSICAL); + timer_set_ctl(PHYSICAL, CTL_ENABLE); + break; + case GUEST_STAGE_PTIMER_TVAL: + timer_set_next_tval_ms(PHYSICAL, test_args.timer_period_ms); + shared_data->xcnt = timer_get_cntct(PHYSICAL); + timer_set_ctl(PHYSICAL, CTL_ENABLE); + break; + default: + GUEST_ASSERT(0); + } +} + +static void guest_validate_irq(unsigned int intid, + struct test_vcpu_shared_data *shared_data) +{ + enum guest_stage stage = shared_data->guest_stage; + uint64_t xcnt = 0, xcnt_diff_us, cval = 0; + unsigned long xctl = 0; + unsigned int timer_irq = 0; + unsigned int accessor; + + if (intid == IAR_SPURIOUS) + return; + + switch (stage) { + case GUEST_STAGE_VTIMER_CVAL: + case GUEST_STAGE_VTIMER_TVAL: + accessor = VIRTUAL; + timer_irq = vtimer_irq; + break; + case GUEST_STAGE_PTIMER_CVAL: + case GUEST_STAGE_PTIMER_TVAL: + accessor = PHYSICAL; + timer_irq = ptimer_irq; + break; + default: + GUEST_ASSERT(0); + return; + } + + xctl = timer_get_ctl(accessor); + if ((xctl & CTL_IMASK) || !(xctl & CTL_ENABLE)) + return; + + timer_set_ctl(accessor, CTL_IMASK); + xcnt = timer_get_cntct(accessor); + cval = timer_get_cval(accessor); + + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, timer_irq); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(xcnt >= cval, + "xcnt = 0x%lx, cval = 0x%lx, xcnt_diff_us = 0x%lx", + xcnt, cval, xcnt_diff_us); + __GUEST_ASSERT(xctl & CTL_ISTATUS, "xctl = 0x%lx", xctl); + + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid = gic_get_and_ack_irq(); + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + guest_validate_irq(intid, shared_data); + + gic_set_eoi(intid); +} + +static void guest_run_stage(struct test_vcpu_shared_data *shared_data, + enum guest_stage stage) +{ + uint32_t irq_iter, config_iter; + + shared_data->guest_stage = stage; + shared_data->nr_iter = 0; + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup the next interrupt */ + guest_configure_timer_action(shared_data); + + /* Setup a timeout for the interrupt to arrive */ + udelay(msecs_to_usecs(test_args.timer_period_ms) + + test_args.timer_err_margin_us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the 
specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + local_irq_disable(); + + gic_init(GIC_V3, test_args.nr_vcpus); + + timer_set_ctl(VIRTUAL, CTL_IMASK); + timer_set_ctl(PHYSICAL, CTL_IMASK); + + gic_irq_enable(vtimer_irq); + gic_irq_enable(ptimer_irq); + local_irq_enable(); + + guest_run_stage(shared_data, GUEST_STAGE_VTIMER_CVAL); + guest_run_stage(shared_data, GUEST_STAGE_VTIMER_TVAL); + guest_run_stage(shared_data, GUEST_STAGE_PTIMER_CVAL); + guest_run_stage(shared_data, GUEST_STAGE_PTIMER_TVAL); + + GUEST_DONE(); +} + +static void test_init_timer_irq(struct kvm_vm *vm) +{ + /* Timer initid should be same for all the vCPUs, so query only vCPU-0 */ + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_PTIMER, &ptimer_irq); + vcpu_device_attr_get(vcpus[0], KVM_ARM_VCPU_TIMER_CTRL, + KVM_ARM_VCPU_TIMER_IRQ_VTIMER, &vtimer_irq); + + sync_global_to_guest(vm, ptimer_irq); + sync_global_to_guest(vm, vtimer_irq); + + pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); +} + +static int gic_fd; + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + unsigned int i; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + if (!test_args.reserved) { + if (kvm_has_cap(KVM_CAP_COUNTER_OFFSET)) { + struct kvm_arm_counter_offset offset = { + .counter_offset = test_args.counter_offset, + .reserved = 0, + }; + vm_ioctl(vm, KVM_ARM_SET_COUNTER_OFFSET, &offset); + } else + TEST_FAIL("no support for global offset"); + } + + for (i = 0; i < nr_vcpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + test_init_timer_irq(vm); + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + close(gic_fd); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/aarch64/debug-exceptions.c b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c new file mode 100644 index 000000000000..2582c49e525a --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/debug-exceptions.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> +#include <linux/bitfield.h> + +#define MDSCR_KDE (1 << 13) +#define MDSCR_MDE (1 << 15) +#define MDSCR_SS (1 << 0) + +#define DBGBCR_LEN8 (0xff << 5) +#define DBGBCR_EXEC (0x0 << 3) +#define DBGBCR_EL1 (0x1 << 1) +#define DBGBCR_E (0x1 << 0) +#define DBGBCR_LBN_SHIFT 16 +#define DBGBCR_BT_SHIFT 20 +#define DBGBCR_BT_ADDR_LINK_CTX (0x1 << DBGBCR_BT_SHIFT) +#define DBGBCR_BT_CTX_LINK (0x3 << DBGBCR_BT_SHIFT) + +#define DBGWCR_LEN8 (0xff << 5) +#define DBGWCR_RD (0x1 << 3) +#define DBGWCR_WR (0x2 << 3) +#define DBGWCR_EL1 (0x1 << 1) +#define DBGWCR_E (0x1 << 0) +#define DBGWCR_LBN_SHIFT 16 +#define DBGWCR_WT_SHIFT 20 +#define DBGWCR_WT_LINK (0x1 << DBGWCR_WT_SHIFT) + +#define SPSR_D (1 << 9) +#define SPSR_SS (1 << 21) + +extern unsigned char sw_bp, sw_bp2, hw_bp, hw_bp2, bp_svc, bp_brk, hw_wp, ss_start, hw_bp_ctx; +extern unsigned char iter_ss_begin, iter_ss_end; +static volatile uint64_t sw_bp_addr, 
hw_bp_addr; +static volatile uint64_t wp_addr, wp_data_addr; +static volatile uint64_t svc_addr; +static volatile uint64_t ss_addr[4], ss_idx; +#define PC(v) ((uint64_t)&(v)) + +#define GEN_DEBUG_WRITE_REG(reg_name) \ +static void write_##reg_name(int num, uint64_t val) \ +{ \ + switch (num) { \ + case 0: \ + write_sysreg(val, reg_name##0_el1); \ + break; \ + case 1: \ + write_sysreg(val, reg_name##1_el1); \ + break; \ + case 2: \ + write_sysreg(val, reg_name##2_el1); \ + break; \ + case 3: \ + write_sysreg(val, reg_name##3_el1); \ + break; \ + case 4: \ + write_sysreg(val, reg_name##4_el1); \ + break; \ + case 5: \ + write_sysreg(val, reg_name##5_el1); \ + break; \ + case 6: \ + write_sysreg(val, reg_name##6_el1); \ + break; \ + case 7: \ + write_sysreg(val, reg_name##7_el1); \ + break; \ + case 8: \ + write_sysreg(val, reg_name##8_el1); \ + break; \ + case 9: \ + write_sysreg(val, reg_name##9_el1); \ + break; \ + case 10: \ + write_sysreg(val, reg_name##10_el1); \ + break; \ + case 11: \ + write_sysreg(val, reg_name##11_el1); \ + break; \ + case 12: \ + write_sysreg(val, reg_name##12_el1); \ + break; \ + case 13: \ + write_sysreg(val, reg_name##13_el1); \ + break; \ + case 14: \ + write_sysreg(val, reg_name##14_el1); \ + break; \ + case 15: \ + write_sysreg(val, reg_name##15_el1); \ + break; \ + default: \ + GUEST_ASSERT(0); \ + } \ +} + +/* Define write_dbgbcr()/write_dbgbvr()/write_dbgwcr()/write_dbgwvr() */ +GEN_DEBUG_WRITE_REG(dbgbcr) +GEN_DEBUG_WRITE_REG(dbgbvr) +GEN_DEBUG_WRITE_REG(dbgwcr) +GEN_DEBUG_WRITE_REG(dbgwvr) + +static void reset_debug_state(void) +{ + uint8_t brps, wrps, i; + uint64_t dfr0; + + asm volatile("msr daifset, #8"); + + write_sysreg(0, osdlr_el1); + write_sysreg(0, oslar_el1); + isb(); + + write_sysreg(0, mdscr_el1); + write_sysreg(0, contextidr_el1); + + /* Reset all bcr/bvr/wcr/wvr registers */ + dfr0 = read_sysreg(id_aa64dfr0_el1); + brps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), dfr0); + for (i = 0; i <= brps; i++) { + write_dbgbcr(i, 0); + write_dbgbvr(i, 0); + } + wrps = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), dfr0); + for (i = 0; i <= wrps; i++) { + write_dbgwcr(i, 0); + write_dbgwvr(i, 0); + } + + isb(); +} + +static void enable_os_lock(void) +{ + write_sysreg(1, oslar_el1); + isb(); + + GUEST_ASSERT(read_sysreg(oslsr_el1) & 2); +} + +static void enable_monitor_debug_exceptions(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_MDE; + write_sysreg(mdscr, mdscr_el1); + isb(); +} + +static void install_wp(uint8_t wpn, uint64_t addr) +{ + uint32_t wcr; + + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E; + write_dbgwcr(wpn, wcr); + write_dbgwvr(wpn, addr); + + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_hw_bp(uint8_t bpn, uint64_t addr) +{ + uint32_t bcr; + + bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E; + write_dbgbcr(bpn, bcr); + write_dbgbvr(bpn, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_wp_ctx(uint8_t addr_wp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t wcr; + uint64_t ctx_bcr; + + /* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* Setup a linked watchpoint (linked to the context-aware breakpoint) */ + wcr = DBGWCR_LEN8 | DBGWCR_RD | DBGWCR_WR | DBGWCR_EL1 | DBGWCR_E | + 
DBGWCR_WT_LINK | ((uint32_t)ctx_bp << DBGWCR_LBN_SHIFT); + write_dbgwcr(addr_wp, wcr); + write_dbgwvr(addr_wp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +void install_hw_bp_ctx(uint8_t addr_bp, uint8_t ctx_bp, uint64_t addr, + uint64_t ctx) +{ + uint32_t addr_bcr, ctx_bcr; + + /* Setup a context-aware breakpoint for Linked Context ID Match */ + ctx_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_CTX_LINK; + write_dbgbcr(ctx_bp, ctx_bcr); + write_dbgbvr(ctx_bp, ctx); + + /* + * Setup a normal breakpoint for Linked Address Match, and link it + * to the context-aware breakpoint. + */ + addr_bcr = DBGBCR_LEN8 | DBGBCR_EXEC | DBGBCR_EL1 | DBGBCR_E | + DBGBCR_BT_ADDR_LINK_CTX | + ((uint32_t)ctx_bp << DBGBCR_LBN_SHIFT); + write_dbgbcr(addr_bp, addr_bcr); + write_dbgbvr(addr_bp, addr); + isb(); + + enable_monitor_debug_exceptions(); +} + +static void install_ss(void) +{ + uint32_t mdscr; + + asm volatile("msr daifclr, #8"); + + mdscr = read_sysreg(mdscr_el1) | MDSCR_KDE | MDSCR_SS; + write_sysreg(mdscr, mdscr_el1); + isb(); +} + +static volatile char write_data; + +static void guest_code(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) +{ + uint64_t ctx = 0xabcdef; /* a random context number */ + + /* Software-breakpoint */ + reset_debug_state(); + asm volatile("sw_bp: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp)); + + /* Hardware-breakpoint */ + reset_debug_state(); + install_hw_bp(bpn, PC(hw_bp)); + asm volatile("hw_bp: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp)); + + /* Hardware-breakpoint + svc */ + reset_debug_state(); + install_hw_bp(bpn, PC(bp_svc)); + asm volatile("bp_svc: svc #0"); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_svc)); + GUEST_ASSERT_EQ(svc_addr, PC(bp_svc) + 4); + + /* Hardware-breakpoint + software-breakpoint */ + reset_debug_state(); + install_hw_bp(bpn, PC(bp_brk)); + asm volatile("bp_brk: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(bp_brk)); + GUEST_ASSERT_EQ(hw_bp_addr, PC(bp_brk)); + + /* Watchpoint */ + reset_debug_state(); + install_wp(wpn, PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + /* Single-step */ + reset_debug_state(); + install_ss(); + ss_idx = 0; + asm volatile("ss_start:\n" + "mrs x0, esr_el1\n" + "add x0, x0, #1\n" + "msr daifset, #8\n" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], PC(ss_start)); + GUEST_ASSERT_EQ(ss_addr[1], PC(ss_start) + 4); + GUEST_ASSERT_EQ(ss_addr[2], PC(ss_start) + 8); + + /* OS Lock does not block software-breakpoint */ + reset_debug_state(); + enable_os_lock(); + sw_bp_addr = 0; + asm volatile("sw_bp2: brk #0"); + GUEST_ASSERT_EQ(sw_bp_addr, PC(sw_bp2)); + + /* OS Lock blocking hardware-breakpoint */ + reset_debug_state(); + enable_os_lock(); + install_hw_bp(bpn, PC(hw_bp2)); + hw_bp_addr = 0; + asm volatile("hw_bp2: nop"); + GUEST_ASSERT_EQ(hw_bp_addr, 0); + + /* OS Lock blocking watchpoint */ + reset_debug_state(); + enable_os_lock(); + write_data = '\0'; + wp_data_addr = 0; + install_wp(wpn, PC(write_data)); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, 0); + + /* OS Lock blocking single-step */ + reset_debug_state(); + enable_os_lock(); + ss_addr[0] = 0; + install_ss(); + ss_idx = 0; + asm volatile("mrs x0, esr_el1\n\t" + "add x0, x0, #1\n\t" + "msr daifset, #8\n\t" + : : : "x0"); + GUEST_ASSERT_EQ(ss_addr[0], 0); + + /* Linked hardware-breakpoint */ + hw_bp_addr = 0; + reset_debug_state(); + install_hw_bp_ctx(bpn, ctx_bpn, PC(hw_bp_ctx), ctx); + /* Set context id */ + 
write_sysreg(ctx, contextidr_el1); + isb(); + asm volatile("hw_bp_ctx: nop"); + write_sysreg(0, contextidr_el1); + GUEST_ASSERT_EQ(hw_bp_addr, PC(hw_bp_ctx)); + + /* Linked watchpoint */ + reset_debug_state(); + install_wp_ctx(wpn, ctx_bpn, PC(write_data), ctx); + /* Set context id */ + write_sysreg(ctx, contextidr_el1); + isb(); + write_data = 'x'; + GUEST_ASSERT_EQ(write_data, 'x'); + GUEST_ASSERT_EQ(wp_data_addr, PC(write_data)); + + GUEST_DONE(); +} + +static void guest_sw_bp_handler(struct ex_regs *regs) +{ + sw_bp_addr = regs->pc; + regs->pc += 4; +} + +static void guest_hw_bp_handler(struct ex_regs *regs) +{ + hw_bp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_wp_handler(struct ex_regs *regs) +{ + wp_data_addr = read_sysreg(far_el1); + wp_addr = regs->pc; + regs->pstate |= SPSR_D; +} + +static void guest_ss_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(ss_idx < 4, "Expected index < 4, got '%lu'", ss_idx); + ss_addr[ss_idx++] = regs->pc; + regs->pstate |= SPSR_SS; +} + +static void guest_svc_handler(struct ex_regs *regs) +{ + svc_addr = regs->pc; +} + +static void guest_code_ss(int test_cnt) +{ + uint64_t i; + uint64_t bvr, wvr, w_bvr, w_wvr; + + for (i = 0; i < test_cnt; i++) { + /* Bits [1:0] of dbg{b,w}vr are RES0 */ + w_bvr = i << 2; + w_wvr = i << 2; + + /* + * Enable Single Step execution. Note! This _must_ be a bare + * ucall as the ucall() path uses atomic operations to manage + * the ucall structures, and the built-in "atomics" are usually + * implemented via exclusive access instructions. The exlusive + * monitor is cleared on ERET, and so taking debug exceptions + * during a LDREX=>STREX sequence will prevent forward progress + * and hang the guest/test. + */ + GUEST_UCALL_NONE(); + + /* + * The userspace will verify that the pc is as expected during + * single step execution between iter_ss_begin and iter_ss_end. + */ + asm volatile("iter_ss_begin:nop\n"); + + write_sysreg(w_bvr, dbgbvr0_el1); + write_sysreg(w_wvr, dbgwvr0_el1); + bvr = read_sysreg(dbgbvr0_el1); + wvr = read_sysreg(dbgwvr0_el1); + + /* Userspace disables Single Step when the end is nigh. 
*/ + asm volatile("iter_ss_end:\n"); + + GUEST_ASSERT_EQ(bvr, w_bvr); + GUEST_ASSERT_EQ(wvr, w_wvr); + } + GUEST_DONE(); +} + +static int debug_version(uint64_t id_aa64dfr0) +{ + return FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0); +} + +static void test_guest_debug_exceptions(uint8_t bpn, uint8_t wpn, uint8_t ctx_bpn) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_BRK_INS, guest_sw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_HW_BP_CURRENT, guest_hw_bp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_WP_CURRENT, guest_wp_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SSTEP_CURRENT, guest_ss_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_SVC64, guest_svc_handler); + + /* Specify bpn/wpn/ctx_bpn to be tested */ + vcpu_args_set(vcpu, 3, bpn, wpn, ctx_bpn); + pr_debug("Use bpn#%d, wpn#%d and ctx_bpn#%d\n", bpn, wpn, ctx_bpn); + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + +done: + kvm_vm_free(vm); +} + +void test_single_step_from_userspace(int test_cnt) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + struct kvm_run *run; + uint64_t pc, cmd; + uint64_t test_pc = 0; + bool ss_enable = false; + struct kvm_guest_debug debug = {}; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_ss); + run = vcpu->run; + vcpu_args_set(vcpu, 1, test_cnt); + + while (1) { + vcpu_run(vcpu); + if (run->exit_reason != KVM_EXIT_DEBUG) { + cmd = get_ucall(vcpu, &uc); + if (cmd == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + } else if (cmd == UCALL_DONE) { + break; + } + + TEST_ASSERT(cmd == UCALL_NONE, + "Unexpected ucall cmd 0x%lx", cmd); + + debug.control = KVM_GUESTDBG_ENABLE | + KVM_GUESTDBG_SINGLESTEP; + ss_enable = true; + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + TEST_ASSERT(ss_enable, "Unexpected KVM_EXIT_DEBUG"); + + /* Check if the current pc is expected. */ + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); + TEST_ASSERT(!test_pc || pc == test_pc, + "Unexpected pc 0x%lx (expected 0x%lx)", + pc, test_pc); + + if ((pc + 4) == (uint64_t)&iter_ss_end) { + test_pc = 0; + debug.control = KVM_GUESTDBG_ENABLE; + ss_enable = false; + vcpu_guest_debug_set(vcpu, &debug); + continue; + } + + /* + * If the current pc is between iter_ss_bgin and + * iter_ss_end, the pc for the next KVM_EXIT_DEBUG should + * be the current pc + 4. + */ + if ((pc >= (uint64_t)&iter_ss_begin) && + (pc < (uint64_t)&iter_ss_end)) + test_pc = pc + 4; + else + test_pc = 0; + } + + kvm_vm_free(vm); +} + +/* + * Run debug testing using the various breakpoint#, watchpoint# and + * context-aware breakpoint# with the given ID_AA64DFR0_EL1 configuration. 
+ */ +void test_guest_debug_exceptions_all(uint64_t aa64dfr0) +{ + uint8_t brp_num, wrp_num, ctx_brp_num, normal_brp_num, ctx_brp_base; + int b, w, c; + + /* Number of breakpoints */ + brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), aa64dfr0) + 1; + __TEST_REQUIRE(brp_num >= 2, "At least two breakpoints are required"); + + /* Number of watchpoints */ + wrp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), aa64dfr0) + 1; + + /* Number of context aware breakpoints */ + ctx_brp_num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_CTX_CMPs), aa64dfr0) + 1; + + pr_debug("%s brp_num:%d, wrp_num:%d, ctx_brp_num:%d\n", __func__, + brp_num, wrp_num, ctx_brp_num); + + /* Number of normal (non-context aware) breakpoints */ + normal_brp_num = brp_num - ctx_brp_num; + + /* Lowest context aware breakpoint number */ + ctx_brp_base = normal_brp_num; + + /* Run tests with all supported breakpoints/watchpoints */ + for (c = ctx_brp_base; c < ctx_brp_base + ctx_brp_num; c++) { + for (b = 0; b < normal_brp_num; b++) { + for (w = 0; w < wrp_num; w++) + test_guest_debug_exceptions(b, w, c); + } + } +} + +static void help(char *name) +{ + puts(""); + printf("Usage: %s [-h] [-i iterations of the single step test]\n", name); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int opt; + int ss_iteration = 10000; + uint64_t aa64dfr0; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &aa64dfr0); + __TEST_REQUIRE(debug_version(aa64dfr0) >= 6, + "Armv8 debug architecture not supported."); + kvm_vm_free(vm); + + while ((opt = getopt(argc, argv, "i:")) != -1) { + switch (opt) { + case 'i': + ss_iteration = atoi_positive("Number of iterations", optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + test_guest_debug_exceptions_all(aa64dfr0); + test_single_step_from_userspace(ss_iteration); + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c new file mode 100644 index 000000000000..709d7d721760 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * While the blessed list should be created from the oldest possible + * kernel, we can't go older than v5.2, though, because that's the first + * release which includes df205b5c6328 ("KVM: arm64: Filter out invalid + * core register IDs in KVM_GET_REG_LIST"). Without that commit the core + * registers won't match expectations. + */ +#include <stdio.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +struct feature_id_reg { + __u64 reg; + __u64 id_reg; + __u64 feat_shift; + __u64 feat_min; +}; + +static struct feature_id_reg feat_id_regs[] = { + { + ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 0, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 4, + 1 + }, + { + ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + 4, + 1 + } +}; + +bool filter_reg(__u64 reg) +{ + /* + * DEMUX register presence depends on the host's CLIDR_EL1. + * This means there's no set of them that we can bless. 
+ */ + if ((reg & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) + return true; + + return false; +} + +static bool check_supported_feat_reg(struct kvm_vcpu *vcpu, __u64 reg) +{ + int i, ret; + __u64 data, feat_val; + + for (i = 0; i < ARRAY_SIZE(feat_id_regs); i++) { + if (feat_id_regs[i].reg == reg) { + ret = __vcpu_get_reg(vcpu, feat_id_regs[i].id_reg, &data); + if (ret < 0) + return false; + + feat_val = ((data >> feat_id_regs[i].feat_shift) & 0xf); + return feat_val >= feat_id_regs[i].feat_min; + } + } + + return true; +} + +bool check_supported_reg(struct kvm_vcpu *vcpu, __u64 reg) +{ + return check_supported_feat_reg(vcpu, reg); +} + +bool check_reject_set(int err) +{ + return err == EPERM; +} + +void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) +{ + struct vcpu_reg_sublist *s; + int feature; + + for_each_sublist(c, s) { + if (s->finalize) { + feature = s->feature; + vcpu_ioctl(vcpu, KVM_ARM_VCPU_FINALIZE, &feature); + } + } +} + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK) + +#define CORE_REGS_XX_NR_WORDS 2 +#define CORE_SPSR_XX_NR_WORDS 2 +#define CORE_FPREGS_XX_NR_WORDS 4 + +static const char *core_id_to_str(const char *prefix, __u64 id) +{ + __u64 core_off = id & ~REG_MASK, idx; + + /* + * core_off is the offset into struct kvm_regs + */ + switch (core_off) { + case KVM_REG_ARM_CORE_REG(regs.regs[0]) ... + KVM_REG_ARM_CORE_REG(regs.regs[30]): + idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS; + TEST_ASSERT(idx < 31, "%s: Unexpected regs.regs index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(regs.regs[%lld])", idx); + case KVM_REG_ARM_CORE_REG(regs.sp): + return "KVM_REG_ARM_CORE_REG(regs.sp)"; + case KVM_REG_ARM_CORE_REG(regs.pc): + return "KVM_REG_ARM_CORE_REG(regs.pc)"; + case KVM_REG_ARM_CORE_REG(regs.pstate): + return "KVM_REG_ARM_CORE_REG(regs.pstate)"; + case KVM_REG_ARM_CORE_REG(sp_el1): + return "KVM_REG_ARM_CORE_REG(sp_el1)"; + case KVM_REG_ARM_CORE_REG(elr_el1): + return "KVM_REG_ARM_CORE_REG(elr_el1)"; + case KVM_REG_ARM_CORE_REG(spsr[0]) ... + KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]): + idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS; + TEST_ASSERT(idx < KVM_NR_SPSR, "%s: Unexpected spsr index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(spsr[%lld])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ... + KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]): + idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS; + TEST_ASSERT(idx < 32, "%s: Unexpected fp_regs.vregs index: %lld", prefix, idx); + return strdup_printf("KVM_REG_ARM_CORE_REG(fp_regs.vregs[%lld])", idx); + case KVM_REG_ARM_CORE_REG(fp_regs.fpsr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)"; + case KVM_REG_ARM_CORE_REG(fp_regs.fpcr): + return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)"; + } + + TEST_FAIL("%s: Unknown core reg id: 0x%llx", prefix, id); + return NULL; +} + +static const char *sve_id_to_str(const char *prefix, __u64 id) +{ + __u64 sve_off, n, i; + + if (id == KVM_REG_ARM64_SVE_VLS) + return "KVM_REG_ARM64_SVE_VLS"; + + sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1)); + i = id & (KVM_ARM64_SVE_MAX_SLICES - 1); + + TEST_ASSERT(i == 0, "%s: Currently we don't expect slice > 0, reg id 0x%llx", prefix, id); + + switch (sve_off) { + case KVM_REG_ARM64_SVE_ZREG_BASE ... 
+ KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0), + "%s: Unexpected bits set in SVE ZREG id: 0x%llx", prefix, id); + return strdup_printf("KVM_REG_ARM64_SVE_ZREG(%lld, 0)", n); + case KVM_REG_ARM64_SVE_PREG_BASE ... + KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1: + n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1); + TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0), + "%s: Unexpected bits set in SVE PREG id: 0x%llx", prefix, id); + return strdup_printf("KVM_REG_ARM64_SVE_PREG(%lld, 0)", n); + case KVM_REG_ARM64_SVE_FFR_BASE: + TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0), + "%s: Unexpected bits set in SVE FFR id: 0x%llx", prefix, id); + return "KVM_REG_ARM64_SVE_FFR(0)"; + } + + return NULL; +} + +void print_reg(const char *prefix, __u64 id) +{ + unsigned op0, op1, crn, crm, op2; + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64, + "%s: KVM_REG_ARM64 missing in reg id: 0x%llx", prefix, id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U8: + reg_size = "KVM_REG_SIZE_U8"; + break; + case KVM_REG_SIZE_U16: + reg_size = "KVM_REG_SIZE_U16"; + break; + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + case KVM_REG_SIZE_U256: + reg_size = "KVM_REG_SIZE_U256"; + break; + case KVM_REG_SIZE_U512: + reg_size = "KVM_REG_SIZE_U512"; + break; + case KVM_REG_SIZE_U1024: + reg_size = "KVM_REG_SIZE_U1024"; + break; + case KVM_REG_SIZE_U2048: + reg_size = "KVM_REG_SIZE_U2048"; + break; + default: + TEST_FAIL("%s: Unexpected reg size: 0x%llx in reg id: 0x%llx", + prefix, (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id); + } + + switch (id & KVM_REG_ARM_COPROC_MASK) { + case KVM_REG_ARM_CORE: + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(prefix, id)); + break; + case KVM_REG_ARM_DEMUX: + TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)), + "%s: Unexpected bits set in DEMUX reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n", + reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK); + break; + case KVM_REG_ARM64_SYSREG: + op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT; + op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT; + crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT; + crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT; + op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT; + TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2), + "%s: Unexpected bits set in SYSREG reg id: 0x%llx", prefix, id); + printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2); + break; + case KVM_REG_ARM_FW: + TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff), + "%s: Unexpected bits set in FW reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM_FW_FEAT_BMAP: + TEST_ASSERT(id == KVM_REG_ARM_FW_FEAT_BMAP_REG(id & 0xffff), + "%s: Unexpected bits set in the bitmap feature FW reg id: 0x%llx", prefix, id); + printf("\tKVM_REG_ARM_FW_FEAT_BMAP_REG(%lld),\n", id & 0xffff); + break; + case KVM_REG_ARM64_SVE: + printf("\t%s,\n", 
sve_id_to_str(prefix, id)); + break; + default: + TEST_FAIL("%s: Unexpected coproc type: 0x%llx in reg id: 0x%llx", + prefix, (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id); + } +} + +/* + * The original blessed list was primed with the output of kernel version + * v4.15 with --core-reg-fixup and then later updated with new registers. + * (The --core-reg-fixup option and it's fixup function have been removed + * from the test, as it's unlikely to use this type of test on a kernel + * older than v5.2.) + * + * The blessed list is up to date with kernel version v6.4 (or so we hope) + */ +static __u64 base_regs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | 
KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr), + KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr), + KVM_REG_ARM_FW_REG(0), /* KVM_REG_ARM_PSCI_VERSION */ + KVM_REG_ARM_FW_REG(1), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 */ + KVM_REG_ARM_FW_REG(2), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 */ + KVM_REG_ARM_FW_REG(3), /* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3 */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(0), /* KVM_REG_ARM_STD_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 2), + ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */ + ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */ + ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */ + ARM64_SYS_REG(2, 0, 0, 0, 4), + ARM64_SYS_REG(2, 0, 0, 0, 5), + ARM64_SYS_REG(2, 0, 0, 0, 6), + ARM64_SYS_REG(2, 0, 0, 0, 7), + ARM64_SYS_REG(2, 0, 0, 1, 4), + ARM64_SYS_REG(2, 0, 0, 1, 5), + ARM64_SYS_REG(2, 0, 0, 1, 6), + ARM64_SYS_REG(2, 0, 0, 1, 7), + ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */ + ARM64_SYS_REG(2, 0, 0, 2, 4), + ARM64_SYS_REG(2, 0, 0, 2, 5), + ARM64_SYS_REG(2, 0, 0, 2, 6), + ARM64_SYS_REG(2, 0, 0, 2, 7), + ARM64_SYS_REG(2, 0, 0, 3, 4), + ARM64_SYS_REG(2, 0, 0, 3, 5), + ARM64_SYS_REG(2, 0, 0, 3, 6), + ARM64_SYS_REG(2, 0, 0, 3, 7), + ARM64_SYS_REG(2, 0, 0, 4, 4), + ARM64_SYS_REG(2, 0, 0, 4, 5), + ARM64_SYS_REG(2, 0, 0, 4, 6), + ARM64_SYS_REG(2, 0, 0, 4, 7), + ARM64_SYS_REG(2, 0, 0, 5, 4), + ARM64_SYS_REG(2, 0, 0, 5, 5), + ARM64_SYS_REG(2, 0, 0, 5, 6), + ARM64_SYS_REG(2, 0, 0, 5, 7), + ARM64_SYS_REG(2, 0, 0, 6, 4), + ARM64_SYS_REG(2, 0, 0, 6, 5), + ARM64_SYS_REG(2, 0, 0, 6, 6), + ARM64_SYS_REG(2, 0, 0, 6, 7), + ARM64_SYS_REG(2, 0, 0, 7, 4), + ARM64_SYS_REG(2, 0, 0, 7, 5), + ARM64_SYS_REG(2, 0, 0, 7, 6), + ARM64_SYS_REG(2, 0, 0, 7, 7), + ARM64_SYS_REG(2, 0, 0, 8, 4), + ARM64_SYS_REG(2, 0, 0, 8, 5), + ARM64_SYS_REG(2, 0, 0, 8, 6), + ARM64_SYS_REG(2, 0, 0, 8, 7), + ARM64_SYS_REG(2, 0, 0, 9, 4), + ARM64_SYS_REG(2, 0, 0, 9, 5), + ARM64_SYS_REG(2, 0, 0, 9, 6), + ARM64_SYS_REG(2, 0, 0, 9, 7), + ARM64_SYS_REG(2, 0, 0, 10, 4), + ARM64_SYS_REG(2, 0, 0, 10, 5), + ARM64_SYS_REG(2, 0, 0, 10, 6), + ARM64_SYS_REG(2, 0, 0, 10, 7), + ARM64_SYS_REG(2, 0, 0, 11, 4), + ARM64_SYS_REG(2, 0, 0, 11, 5), + ARM64_SYS_REG(2, 0, 0, 11, 6), + ARM64_SYS_REG(2, 0, 0, 11, 7), + ARM64_SYS_REG(2, 0, 0, 12, 4), + ARM64_SYS_REG(2, 0, 0, 12, 
5), + ARM64_SYS_REG(2, 0, 0, 12, 6), + ARM64_SYS_REG(2, 0, 0, 12, 7), + ARM64_SYS_REG(2, 0, 0, 13, 4), + ARM64_SYS_REG(2, 0, 0, 13, 5), + ARM64_SYS_REG(2, 0, 0, 13, 6), + ARM64_SYS_REG(2, 0, 0, 13, 7), + ARM64_SYS_REG(2, 0, 0, 14, 4), + ARM64_SYS_REG(2, 0, 0, 14, 5), + ARM64_SYS_REG(2, 0, 0, 14, 6), + ARM64_SYS_REG(2, 0, 0, 14, 7), + ARM64_SYS_REG(2, 0, 0, 15, 4), + ARM64_SYS_REG(2, 0, 0, 15, 5), + ARM64_SYS_REG(2, 0, 0, 15, 6), + ARM64_SYS_REG(2, 0, 0, 15, 7), + ARM64_SYS_REG(2, 0, 1, 1, 4), /* OSLSR_EL1 */ + ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */ + ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 3), + ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */ + ARM64_SYS_REG(3, 0, 0, 3, 7), + ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 2), /* ID_AA64PFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 3), + ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 5), /* ID_AA64SMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 6), + ARM64_SYS_REG(3, 0, 0, 4, 7), + ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 2), + ARM64_SYS_REG(3, 0, 0, 5, 3), + ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 5, 6), + ARM64_SYS_REG(3, 0, 0, 5, 7), + ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 2), /* ID_AA64ISAR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 6, 3), + ARM64_SYS_REG(3, 0, 0, 6, 4), + ARM64_SYS_REG(3, 0, 0, 6, 5), + ARM64_SYS_REG(3, 0, 0, 6, 6), + ARM64_SYS_REG(3, 0, 0, 6, 7), + ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 4), /* ID_AA64MMFR4_EL1 */ + ARM64_SYS_REG(3, 0, 0, 7, 5), + ARM64_SYS_REG(3, 0, 0, 7, 6), + ARM64_SYS_REG(3, 0, 0, 7, 7), + ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */ + ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 
0), /* AFSR0_EL1 */ + ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ + ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ + ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */ + ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ + ARM64_SYS_REG(3, 0, 10, 2, 3), /* PIR_EL1 */ + ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */ + ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */ + ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ + ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ + ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ + ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ + ARM64_SYS_REG(3, 3, 14, 0, 1), /* CNTPCT_EL0 */ + ARM64_SYS_REG(3, 3, 14, 2, 1), /* CNTP_CTL_EL0 */ + ARM64_SYS_REG(3, 3, 14, 2, 2), /* CNTP_CVAL_EL0 */ + ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */ + ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */ +}; + +static __u64 pmu_regs[] = { + ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */ + ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */ + ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */ + ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */ + ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */ + ARM64_SYS_REG(3, 3, 14, 8, 0), + ARM64_SYS_REG(3, 3, 14, 8, 1), + ARM64_SYS_REG(3, 3, 14, 8, 2), + ARM64_SYS_REG(3, 3, 14, 8, 3), + ARM64_SYS_REG(3, 3, 14, 8, 4), + ARM64_SYS_REG(3, 3, 14, 8, 5), + ARM64_SYS_REG(3, 3, 14, 8, 6), + ARM64_SYS_REG(3, 3, 14, 8, 7), + ARM64_SYS_REG(3, 3, 14, 9, 0), + ARM64_SYS_REG(3, 3, 14, 9, 1), + ARM64_SYS_REG(3, 3, 14, 9, 2), + ARM64_SYS_REG(3, 3, 14, 9, 3), + ARM64_SYS_REG(3, 3, 14, 9, 4), + ARM64_SYS_REG(3, 3, 14, 9, 5), + ARM64_SYS_REG(3, 3, 14, 9, 6), + ARM64_SYS_REG(3, 3, 14, 9, 7), + ARM64_SYS_REG(3, 3, 14, 10, 0), + ARM64_SYS_REG(3, 3, 14, 10, 1), + ARM64_SYS_REG(3, 3, 14, 10, 2), + ARM64_SYS_REG(3, 3, 14, 10, 3), + ARM64_SYS_REG(3, 3, 14, 10, 4), + ARM64_SYS_REG(3, 3, 14, 10, 5), + ARM64_SYS_REG(3, 3, 14, 10, 6), + ARM64_SYS_REG(3, 3, 14, 10, 7), + ARM64_SYS_REG(3, 3, 14, 11, 0), + ARM64_SYS_REG(3, 3, 14, 11, 1), + ARM64_SYS_REG(3, 3, 14, 11, 2), + ARM64_SYS_REG(3, 3, 14, 11, 3), + ARM64_SYS_REG(3, 3, 14, 11, 4), + ARM64_SYS_REG(3, 3, 14, 11, 5), + ARM64_SYS_REG(3, 3, 14, 11, 6), + ARM64_SYS_REG(3, 3, 14, 12, 0), + ARM64_SYS_REG(3, 3, 14, 12, 1), + ARM64_SYS_REG(3, 3, 14, 12, 2), + ARM64_SYS_REG(3, 3, 14, 12, 3), + ARM64_SYS_REG(3, 3, 14, 12, 4), + ARM64_SYS_REG(3, 3, 14, 12, 5), + ARM64_SYS_REG(3, 3, 14, 12, 6), + ARM64_SYS_REG(3, 3, 14, 12, 7), + ARM64_SYS_REG(3, 3, 14, 13, 0), + ARM64_SYS_REG(3, 3, 14, 13, 1), + ARM64_SYS_REG(3, 3, 14, 13, 2), + ARM64_SYS_REG(3, 3, 14, 13, 3), + ARM64_SYS_REG(3, 3, 14, 13, 4), + ARM64_SYS_REG(3, 3, 14, 13, 5), + ARM64_SYS_REG(3, 3, 14, 13, 6), + ARM64_SYS_REG(3, 3, 14, 13, 7), + ARM64_SYS_REG(3, 3, 14, 14, 0), + ARM64_SYS_REG(3, 3, 14, 14, 1), + ARM64_SYS_REG(3, 3, 14, 14, 2), + ARM64_SYS_REG(3, 3, 14, 14, 3), + ARM64_SYS_REG(3, 3, 14, 14, 4), + ARM64_SYS_REG(3, 3, 14, 14, 5), + ARM64_SYS_REG(3, 3, 14, 14, 6), + ARM64_SYS_REG(3, 3, 14, 
14, 7), + ARM64_SYS_REG(3, 3, 14, 15, 0), + ARM64_SYS_REG(3, 3, 14, 15, 1), + ARM64_SYS_REG(3, 3, 14, 15, 2), + ARM64_SYS_REG(3, 3, 14, 15, 3), + ARM64_SYS_REG(3, 3, 14, 15, 4), + ARM64_SYS_REG(3, 3, 14, 15, 5), + ARM64_SYS_REG(3, 3, 14, 15, 6), + ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */ +}; + +static __u64 vregs[] = { + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]), + KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]), +}; + +static __u64 sve_regs[] = { + KVM_REG_ARM64_SVE_VLS, + KVM_REG_ARM64_SVE_ZREG(0, 0), + 
KVM_REG_ARM64_SVE_ZREG(1, 0), + KVM_REG_ARM64_SVE_ZREG(2, 0), + KVM_REG_ARM64_SVE_ZREG(3, 0), + KVM_REG_ARM64_SVE_ZREG(4, 0), + KVM_REG_ARM64_SVE_ZREG(5, 0), + KVM_REG_ARM64_SVE_ZREG(6, 0), + KVM_REG_ARM64_SVE_ZREG(7, 0), + KVM_REG_ARM64_SVE_ZREG(8, 0), + KVM_REG_ARM64_SVE_ZREG(9, 0), + KVM_REG_ARM64_SVE_ZREG(10, 0), + KVM_REG_ARM64_SVE_ZREG(11, 0), + KVM_REG_ARM64_SVE_ZREG(12, 0), + KVM_REG_ARM64_SVE_ZREG(13, 0), + KVM_REG_ARM64_SVE_ZREG(14, 0), + KVM_REG_ARM64_SVE_ZREG(15, 0), + KVM_REG_ARM64_SVE_ZREG(16, 0), + KVM_REG_ARM64_SVE_ZREG(17, 0), + KVM_REG_ARM64_SVE_ZREG(18, 0), + KVM_REG_ARM64_SVE_ZREG(19, 0), + KVM_REG_ARM64_SVE_ZREG(20, 0), + KVM_REG_ARM64_SVE_ZREG(21, 0), + KVM_REG_ARM64_SVE_ZREG(22, 0), + KVM_REG_ARM64_SVE_ZREG(23, 0), + KVM_REG_ARM64_SVE_ZREG(24, 0), + KVM_REG_ARM64_SVE_ZREG(25, 0), + KVM_REG_ARM64_SVE_ZREG(26, 0), + KVM_REG_ARM64_SVE_ZREG(27, 0), + KVM_REG_ARM64_SVE_ZREG(28, 0), + KVM_REG_ARM64_SVE_ZREG(29, 0), + KVM_REG_ARM64_SVE_ZREG(30, 0), + KVM_REG_ARM64_SVE_ZREG(31, 0), + KVM_REG_ARM64_SVE_PREG(0, 0), + KVM_REG_ARM64_SVE_PREG(1, 0), + KVM_REG_ARM64_SVE_PREG(2, 0), + KVM_REG_ARM64_SVE_PREG(3, 0), + KVM_REG_ARM64_SVE_PREG(4, 0), + KVM_REG_ARM64_SVE_PREG(5, 0), + KVM_REG_ARM64_SVE_PREG(6, 0), + KVM_REG_ARM64_SVE_PREG(7, 0), + KVM_REG_ARM64_SVE_PREG(8, 0), + KVM_REG_ARM64_SVE_PREG(9, 0), + KVM_REG_ARM64_SVE_PREG(10, 0), + KVM_REG_ARM64_SVE_PREG(11, 0), + KVM_REG_ARM64_SVE_PREG(12, 0), + KVM_REG_ARM64_SVE_PREG(13, 0), + KVM_REG_ARM64_SVE_PREG(14, 0), + KVM_REG_ARM64_SVE_PREG(15, 0), + KVM_REG_ARM64_SVE_FFR(0), + ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */ +}; + +static __u64 sve_rejects_set[] = { + KVM_REG_ARM64_SVE_VLS, +}; + +static __u64 pauth_addr_regs[] = { + ARM64_SYS_REG(3, 0, 2, 1, 0), /* APIAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 1), /* APIAKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 2), /* APIBKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 1, 3), /* APIBKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 0), /* APDAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 1), /* APDAKEYHI_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 2), /* APDBKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 2, 3) /* APDBKEYHI_EL1 */ +}; + +static __u64 pauth_generic_regs[] = { + ARM64_SYS_REG(3, 0, 2, 3, 0), /* APGAKEYLO_EL1 */ + ARM64_SYS_REG(3, 0, 2, 3, 1), /* APGAKEYHI_EL1 */ +}; + +#define BASE_SUBLIST \ + { "base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), } +#define VREGS_SUBLIST \ + { "vregs", .regs = vregs, .regs_n = ARRAY_SIZE(vregs), } +#define PMU_SUBLIST \ + { "pmu", .capability = KVM_CAP_ARM_PMU_V3, .feature = KVM_ARM_VCPU_PMU_V3, \ + .regs = pmu_regs, .regs_n = ARRAY_SIZE(pmu_regs), } +#define SVE_SUBLIST \ + { "sve", .capability = KVM_CAP_ARM_SVE, .feature = KVM_ARM_VCPU_SVE, .finalize = true, \ + .regs = sve_regs, .regs_n = ARRAY_SIZE(sve_regs), \ + .rejects_set = sve_rejects_set, .rejects_set_n = ARRAY_SIZE(sve_rejects_set), } +#define PAUTH_SUBLIST \ + { \ + .name = "pauth_address", \ + .capability = KVM_CAP_ARM_PTRAUTH_ADDRESS, \ + .feature = KVM_ARM_VCPU_PTRAUTH_ADDRESS, \ + .regs = pauth_addr_regs, \ + .regs_n = ARRAY_SIZE(pauth_addr_regs), \ + }, \ + { \ + .name = "pauth_generic", \ + .capability = KVM_CAP_ARM_PTRAUTH_GENERIC, \ + .feature = KVM_ARM_VCPU_PTRAUTH_GENERIC, \ + .regs = pauth_generic_regs, \ + .regs_n = ARRAY_SIZE(pauth_generic_regs), \ + } + +static struct vcpu_reg_list vregs_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list vregs_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static 
struct vcpu_reg_list sve_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list sve_pmu_config = { + .sublists = { + BASE_SUBLIST, + SVE_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list pauth_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + {0}, + }, +}; +static struct vcpu_reg_list pauth_pmu_config = { + .sublists = { + BASE_SUBLIST, + VREGS_SUBLIST, + PAUTH_SUBLIST, + PMU_SUBLIST, + {0}, + }, +}; + +struct vcpu_reg_list *vcpu_configs[] = { + &vregs_config, + &vregs_pmu_config, + &sve_config, + &sve_pmu_config, + &pauth_config, + &pauth_pmu_config, +}; +int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/aarch64/hypercalls.c b/tools/testing/selftests/kvm/aarch64/hypercalls.c new file mode 100644 index 000000000000..9d192ce0078d --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/hypercalls.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* hypercalls: Check the ARM64 pseudo-firmware bitmap register interface. + * + * The test validates the basic hypercall functionalities that are exposed + * via the pseudo-firmware bitmap registers. This includes the registers' + * read/write behavior before and after the VM has started, and whether the + * hypercalls are properly masked or unmasked to the guest when disabled or + * enabled from KVM userspace, respectively. + */ +#include <errno.h> +#include <linux/arm-smccc.h> +#include <asm/kvm.h> +#include <kvm_util.h> + +#include "processor.h" + +#define FW_REG_ULIMIT_VAL(max_feat_bit) (GENMASK(max_feat_bit, 0)) + +/* Last valid bits of the bitmapped firmware registers */ +#define KVM_REG_ARM_STD_BMAP_BIT_MAX 0 +#define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX 0 +#define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX 1 + +struct kvm_fw_reg_info { + uint64_t reg; /* Register definition */ + uint64_t max_feat_bit; /* Bit that represents the upper limit of the feature-map */ +}; + +#define FW_REG_INFO(r) \ + { \ + .reg = r, \ + .max_feat_bit = r##_BIT_MAX, \ + } + +static const struct kvm_fw_reg_info fw_reg_info[] = { + FW_REG_INFO(KVM_REG_ARM_STD_BMAP), + FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP), + FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP), +}; + +enum test_stage { + TEST_STAGE_REG_IFACE, + TEST_STAGE_HVC_IFACE_FEAT_DISABLED, + TEST_STAGE_HVC_IFACE_FEAT_ENABLED, + TEST_STAGE_HVC_IFACE_FALSE_INFO, + TEST_STAGE_END, +}; + +static int stage = TEST_STAGE_REG_IFACE; + +struct test_hvc_info { + uint32_t func_id; + uint64_t arg1; +}; + +#define TEST_HVC_INFO(f, a1) \ + { \ + .func_id = f, \ + .arg1 = a1, \ + } + +static const struct test_hvc_info hvc_info[] = { + /* KVM_REG_ARM_STD_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_TRNG_VERSION, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_TRNG_RND64), + TEST_HVC_INFO(ARM_SMCCC_TRNG_GET_UUID, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_RND32, 0), + TEST_HVC_INFO(ARM_SMCCC_TRNG_RND64, 0), + + /* KVM_REG_ARM_STD_HYP_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_HV_PV_TIME_FEATURES), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_HV_PV_TIME_ST), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_ST, 0), + + /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, + ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, 0), + TEST_HVC_INFO(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, KVM_PTP_VIRT_COUNTER), +}; + +/* Feed false hypercall info to test the KVM behavior */ +static const struct test_hvc_info 
false_hvc_info[] = { + /* Feature support check against a different family of hypercalls */ + TEST_HVC_INFO(ARM_SMCCC_TRNG_FEATURES, ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID), + TEST_HVC_INFO(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_TRNG_RND64), + TEST_HVC_INFO(ARM_SMCCC_HV_PV_TIME_FEATURES, ARM_SMCCC_TRNG_RND64), +}; + +static void guest_test_hvc(const struct test_hvc_info *hc_info) +{ + unsigned int i; + struct arm_smccc_res res; + unsigned int hvc_info_arr_sz; + + hvc_info_arr_sz = + hc_info == hvc_info ? ARRAY_SIZE(hvc_info) : ARRAY_SIZE(false_hvc_info); + + for (i = 0; i < hvc_info_arr_sz; i++, hc_info++) { + memset(&res, 0, sizeof(res)); + smccc_hvc(hc_info->func_id, hc_info->arg1, 0, 0, 0, 0, 0, 0, &res); + + switch (stage) { + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + __GUEST_ASSERT(res.a0 == SMCCC_RET_NOT_SUPPORTED, + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", + res.a0, hc_info->func_id, hc_info->arg1, stage); + break; + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + __GUEST_ASSERT(res.a0 != SMCCC_RET_NOT_SUPPORTED, + "a0 = 0x%lx, func_id = 0x%x, arg1 = 0x%lx, stage = %u", + res.a0, hc_info->func_id, hc_info->arg1, stage); + break; + default: + GUEST_FAIL("Unexpected stage = %u", stage); + } + } +} + +static void guest_code(void) +{ + while (stage != TEST_STAGE_END) { + switch (stage) { + case TEST_STAGE_REG_IFACE: + break; + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + guest_test_hvc(hvc_info); + break; + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + guest_test_hvc(false_hvc_info); + break; + default: + GUEST_FAIL("Unexpected stage = %u", stage); + } + + GUEST_SYNC(stage); + } + + GUEST_DONE(); +} + +struct st_time { + uint32_t rev; + uint32_t attr; + uint64_t st_time; +}; + +#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63) +#define ST_GPA_BASE (1 << 30) + +static void steal_time_init(struct kvm_vcpu *vcpu) +{ + uint64_t st_ipa = (ulong)ST_GPA_BASE; + unsigned int gpages; + + gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE); + vm_userspace_mem_region_add(vcpu->vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); + + vcpu_device_attr_set(vcpu, KVM_ARM_VCPU_PVTIME_CTRL, + KVM_ARM_VCPU_PVTIME_IPA, &st_ipa); +} + +static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) +{ + uint64_t val; + unsigned int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { + const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + + /* First 'read' should be an upper limit of the features supported */ + vcpu_get_reg(vcpu, reg_info->reg, &val); + TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), + "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", + reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); + + /* Test a 'write' by disabling all the features of the register map */ + ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + TEST_ASSERT(ret == 0, + "Failed to clear all the features of reg: 0x%lx; ret: %d", + reg_info->reg, errno); + + vcpu_get_reg(vcpu, reg_info->reg, &val); + TEST_ASSERT(val == 0, + "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); + + /* + * Test enabling a feature that's not supported. + * Avoid this check if all the bits are occupied. 
+ */ + if (reg_info->max_feat_bit < 63) { + ret = __vcpu_set_reg(vcpu, reg_info->reg, BIT(reg_info->max_feat_bit + 1)); + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Unexpected behavior or return value (%d) while setting an unsupported feature for reg: 0x%lx", + errno, reg_info->reg); + } + } +} + +static void test_fw_regs_after_vm_start(struct kvm_vcpu *vcpu) +{ + uint64_t val; + unsigned int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { + const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + + /* + * Before starting the VM, the test clears all the bits. + * Check if that's still the case. + */ + vcpu_get_reg(vcpu, reg_info->reg, &val); + TEST_ASSERT(val == 0, + "Expected all the features to be cleared for reg: 0x%lx", + reg_info->reg); + + /* + * Since the VM has run at least once, KVM shouldn't allow modification of + * the registers and should return EBUSY. Set the registers and check for + * the expected errno. + */ + ret = __vcpu_set_reg(vcpu, reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit)); + TEST_ASSERT(ret != 0 && errno == EBUSY, + "Unexpected behavior or return value (%d) while setting a feature while VM is running for reg: 0x%lx", + errno, reg_info->reg); + } +} + +static struct kvm_vm *test_vm_create(struct kvm_vcpu **vcpu) +{ + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + + steal_time_init(*vcpu); + + return vm; +} + +static void test_guest_stage(struct kvm_vm **vm, struct kvm_vcpu **vcpu) +{ + int prev_stage = stage; + + pr_debug("Stage: %d\n", prev_stage); + + /* Sync the stage early, the VM might be freed below. */ + stage++; + sync_global_to_guest(*vm, stage); + + switch (prev_stage) { + case TEST_STAGE_REG_IFACE: + test_fw_regs_after_vm_start(*vcpu); + break; + case TEST_STAGE_HVC_IFACE_FEAT_DISABLED: + /* Start a new VM so that all the features are now enabled by default */ + kvm_vm_free(*vm); + *vm = test_vm_create(vcpu); + break; + case TEST_STAGE_HVC_IFACE_FEAT_ENABLED: + case TEST_STAGE_HVC_IFACE_FALSE_INFO: + break; + default: + TEST_FAIL("Unknown test stage: %d", prev_stage); + } +} + +static void test_run(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + bool guest_done = false; + + vm = test_vm_create(&vcpu); + + test_fw_regs_before_vm_start(vcpu); + + while (!guest_done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + test_guest_stage(&vm, &vcpu); + break; + case UCALL_DONE: + guest_done = true; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected guest exit"); + } + } + + kvm_vm_free(vm); +} + +int main(void) +{ + test_run(); + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c new file mode 100644 index 000000000000..d29b08198b42 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -0,0 +1,1135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * page_fault_test.c - Test stage 2 faults. + * + * This test tries different combinations of guest accesses (e.g., write, + * S1PTW), backing source type (e.g., anon) and types of faults (e.g., read on + * hugetlbfs with a hole). It checks that the expected handling method is + * called (e.g., uffd faults with the right address and write/read flag). 
+ */ +#include <linux/bitmap.h> +#include <fcntl.h> +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> +#include <asm/sysreg.h> +#include <linux/bitfield.h> +#include "guest_modes.h" +#include "userfaultfd_util.h" + +/* Guest virtual addresses that point to the test page and its PTE. */ +#define TEST_GVA 0xc0000000 +#define TEST_EXEC_GVA (TEST_GVA + 0x8) +#define TEST_PTE_GVA 0xb0000000 +#define TEST_DATA 0x0123456789ABCDEF + +static uint64_t *guest_test_memory = (uint64_t *)TEST_GVA; + +#define CMD_NONE (0) +#define CMD_SKIP_TEST (1ULL << 1) +#define CMD_HOLE_PT (1ULL << 2) +#define CMD_HOLE_DATA (1ULL << 3) +#define CMD_CHECK_WRITE_IN_DIRTY_LOG (1ULL << 4) +#define CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG (1ULL << 5) +#define CMD_CHECK_NO_WRITE_IN_DIRTY_LOG (1ULL << 6) +#define CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG (1ULL << 7) +#define CMD_SET_PTE_AF (1ULL << 8) + +#define PREPARE_FN_NR 10 +#define CHECK_FN_NR 10 + +static struct event_cnt { + int mmio_exits; + int fail_vcpu_runs; + int uffd_faults; + /* uffd_faults is incremented from multiple threads. */ + pthread_mutex_t uffd_faults_mutex; +} events; + +struct test_desc { + const char *name; + uint64_t mem_mark_cmd; + /* Skip the test if any prepare function returns false */ + bool (*guest_prepare[PREPARE_FN_NR])(void); + void (*guest_test)(void); + void (*guest_test_check[CHECK_FN_NR])(void); + uffd_handler_t uffd_pt_handler; + uffd_handler_t uffd_data_handler; + void (*dabt_handler)(struct ex_regs *regs); + void (*iabt_handler)(struct ex_regs *regs); + void (*mmio_handler)(struct kvm_vm *vm, struct kvm_run *run); + void (*fail_vcpu_run_handler)(int ret); + uint32_t pt_memslot_flags; + uint32_t data_memslot_flags; + bool skip; + struct event_cnt expected_events; +}; + +struct test_params { + enum vm_mem_backing_src_type src_type; + struct test_desc *test_desc; +}; + +static inline void flush_tlb_page(uint64_t vaddr) +{ + uint64_t page = vaddr >> 12; + + dsb(ishst); + asm volatile("tlbi vaae1is, %0" :: "r" (page)); + dsb(ish); + isb(); +} + +static void guest_write64(void) +{ + uint64_t val; + + WRITE_ONCE(*guest_test_memory, TEST_DATA); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +/* Check the system for atomic instructions. */ +static bool guest_check_lse(void) +{ + uint64_t isar0 = read_sysreg(id_aa64isar0_el1); + uint64_t atomic; + + atomic = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC), isar0); + return atomic >= 2; +} + +static bool guest_check_dc_zva(void) +{ + uint64_t dczid = read_sysreg(dczid_el0); + uint64_t dzp = FIELD_GET(ARM64_FEATURE_MASK(DCZID_EL0_DZP), dczid); + + return dzp == 0; +} + +/* Compare and swap instruction. 
*/ +static void guest_cas(void) +{ + uint64_t val; + + GUEST_ASSERT(guest_check_lse()); + asm volatile(".arch_extension lse\n" + "casal %0, %1, [%2]\n" + :: "r" (0ul), "r" (TEST_DATA), "r" (guest_test_memory)); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, TEST_DATA); +} + +static void guest_read64(void) +{ + uint64_t val; + + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* Address translation instruction */ +static void guest_at(void) +{ + uint64_t par; + + asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); + isb(); + par = read_sysreg(par_el1); + + /* PAR_EL1.F (bit 0) indicates whether the AT was successful */ + GUEST_ASSERT_EQ(par & 1, 0); +} + +/* + * The size of the block written by "dc zva" is guaranteed to be between (2 << + * 0) and (2 << 9), which is safe in our case as we need the write to happen + * for at least a word, and not more than a page. + */ +static void guest_dc_zva(void) +{ + uint16_t val; + + asm volatile("dc zva, %0" :: "r" (guest_test_memory)); + dsb(ish); + val = READ_ONCE(*guest_test_memory); + GUEST_ASSERT_EQ(val, 0); +} + +/* + * Pre-indexing loads and stores don't have a valid syndrome (ESR_EL2.ISV==0). + * And that's special because KVM must take special care with those: they + * should still count as accesses for dirty logging or user-faulting, but + * should be handled differently on mmio. + */ +static void guest_ld_preidx(void) +{ + uint64_t val; + uint64_t addr = TEST_GVA - 8; + + /* + * This ends up accessing "TEST_GVA + 8 - 8", where "TEST_GVA - 8" is + * in a gap between memslots not backed by anything. + */ + asm volatile("ldr %0, [%1, #8]!" + : "=r" (val), "+r" (addr)); + GUEST_ASSERT_EQ(val, 0); + GUEST_ASSERT_EQ(addr, TEST_GVA); +} + +static void guest_st_preidx(void) +{ + uint64_t val = TEST_DATA; + uint64_t addr = TEST_GVA - 8; + + asm volatile("str %0, [%1, #8]!" + : "+r" (val), "+r" (addr)); + + GUEST_ASSERT_EQ(addr, TEST_GVA); + val = READ_ONCE(*guest_test_memory); +} + +static bool guest_set_ha(void) +{ + uint64_t mmfr1 = read_sysreg(id_aa64mmfr1_el1); + uint64_t hadbs, tcr; + + /* Skip if HA is not supported. 
*/ + hadbs = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS), mmfr1); + if (hadbs == 0) + return false; + + tcr = read_sysreg(tcr_el1) | TCR_EL1_HA; + write_sysreg(tcr, tcr_el1); + isb(); + + return true; +} + +static bool guest_clear_pte_af(void) +{ + *((uint64_t *)TEST_PTE_GVA) &= ~PTE_AF; + flush_tlb_page(TEST_GVA); + + return true; +} + +static void guest_check_pte_af(void) +{ + dsb(ish); + GUEST_ASSERT_EQ(*((uint64_t *)TEST_PTE_GVA) & PTE_AF, PTE_AF); +} + +static void guest_check_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_no_write_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_WRITE_IN_DIRTY_LOG); +} + +static void guest_check_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG); +} + +static void guest_check_no_s1ptw_wr_in_dirty_log(void) +{ + GUEST_SYNC(CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG); +} + +static void guest_exec(void) +{ + int (*code)(void) = (int (*)(void))TEST_EXEC_GVA; + int ret; + + ret = code(); + GUEST_ASSERT_EQ(ret, 0x77); +} + +static bool guest_prepare(struct test_desc *test) +{ + bool (*prepare_fn)(void); + int i; + + for (i = 0; i < PREPARE_FN_NR; i++) { + prepare_fn = test->guest_prepare[i]; + if (prepare_fn && !prepare_fn()) + return false; + } + + return true; +} + +static void guest_test_check(struct test_desc *test) +{ + void (*check_fn)(void); + int i; + + for (i = 0; i < CHECK_FN_NR; i++) { + check_fn = test->guest_test_check[i]; + if (check_fn) + check_fn(); + } +} + +static void guest_code(struct test_desc *test) +{ + if (!guest_prepare(test)) + GUEST_SYNC(CMD_SKIP_TEST); + + GUEST_SYNC(test->mem_mark_cmd); + + if (test->guest_test) + test->guest_test(); + + guest_test_check(test); + GUEST_DONE(); +} + +static void no_dabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected dabt, far_el1 = 0x%lx", read_sysreg(far_el1)); +} + +static void no_iabt_handler(struct ex_regs *regs) +{ + GUEST_FAIL("Unexpected iabt, pc = 0x%lx", regs->pc); +} + +static struct uffd_args { + char *copy; + void *hva; + uint64_t paging_size; +} pt_args, data_args; + +/* Returns true to continue the test, and false if it should be skipped. 
*/ +static int uffd_generic_handler(int uffd_mode, int uffd, struct uffd_msg *msg, + struct uffd_args *args) +{ + uint64_t addr = msg->arg.pagefault.address; + uint64_t flags = msg->arg.pagefault.flags; + struct uffdio_copy copy; + int ret; + + TEST_ASSERT(uffd_mode == UFFDIO_REGISTER_MODE_MISSING, + "The only expected UFFD mode is MISSING"); + TEST_ASSERT_EQ(addr, (uint64_t)args->hva); + + pr_debug("uffd fault: addr=%p write=%d\n", + (void *)addr, !!(flags & UFFD_PAGEFAULT_FLAG_WRITE)); + + copy.src = (uint64_t)args->copy; + copy.dst = addr; + copy.len = args->paging_size; + copy.mode = 0; + + ret = ioctl(uffd, UFFDIO_COPY, ©); + if (ret == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx with errno: %d\n", + addr, errno); + return ret; + } + + pthread_mutex_lock(&events.uffd_faults_mutex); + events.uffd_faults += 1; + pthread_mutex_unlock(&events.uffd_faults_mutex); + return 0; +} + +static int uffd_pt_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &pt_args); +} + +static int uffd_data_handler(int mode, int uffd, struct uffd_msg *msg) +{ + return uffd_generic_handler(mode, uffd, msg, &data_args); +} + +static void setup_uffd_args(struct userspace_mem_region *region, + struct uffd_args *args) +{ + args->hva = (void *)region->region.userspace_addr; + args->paging_size = region->region.memory_size; + + args->copy = malloc(args->paging_size); + TEST_ASSERT(args->copy, "Failed to allocate data copy."); + memcpy(args->copy, args->hva, args->paging_size); +} + +static void setup_uffd(struct kvm_vm *vm, struct test_params *p, + struct uffd_desc **pt_uffd, struct uffd_desc **data_uffd) +{ + struct test_desc *test = p->test_desc; + int uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_PT), &pt_args); + setup_uffd_args(vm_get_mem_region(vm, MEM_REGION_TEST_DATA), &data_args); + + *pt_uffd = NULL; + if (test->uffd_pt_handler) + *pt_uffd = uffd_setup_demand_paging(uffd_mode, 0, + pt_args.hva, + pt_args.paging_size, + 1, test->uffd_pt_handler); + + *data_uffd = NULL; + if (test->uffd_data_handler) + *data_uffd = uffd_setup_demand_paging(uffd_mode, 0, + data_args.hva, + data_args.paging_size, + 1, test->uffd_data_handler); +} + +static void free_uffd(struct test_desc *test, struct uffd_desc *pt_uffd, + struct uffd_desc *data_uffd) +{ + if (test->uffd_pt_handler) + uffd_stop_demand_paging(pt_uffd); + if (test->uffd_data_handler) + uffd_stop_demand_paging(data_uffd); + + free(pt_args.copy); + free(data_args.copy); +} + +static int uffd_no_handler(int mode, int uffd, struct uffd_msg *msg) +{ + TEST_FAIL("There was no UFFD fault expected."); + return -1; +} + +/* Returns false if the test should be skipped. 
*/ +static bool punch_hole_in_backing_store(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + void *hva = (void *)region->region.userspace_addr; + uint64_t paging_size = region->region.memory_size; + int ret, fd = region->fd; + + if (fd != -1) { + ret = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, paging_size); + TEST_ASSERT(ret == 0, "fallocate failed"); + } else { + ret = madvise(hva, paging_size, MADV_DONTNEED); + TEST_ASSERT(ret == 0, "madvise failed"); + } + + return true; +} + +static void mmio_on_test_gpa_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + TEST_ASSERT_EQ(run->mmio.phys_addr, region->region.guest_phys_addr); + + memcpy(hva, run->mmio.data, run->mmio.len); + events.mmio_exits += 1; +} + +static void mmio_no_handler(struct kvm_vm *vm, struct kvm_run *run) +{ + uint64_t data; + + memcpy(&data, run->mmio.data, sizeof(data)); + pr_debug("addr=%lld len=%d w=%d data=%lx\n", + run->mmio.phys_addr, run->mmio.len, + run->mmio.is_write, data); + TEST_FAIL("There was no MMIO exit expected."); +} + +static bool check_write_in_dirty_log(struct kvm_vm *vm, + struct userspace_mem_region *region, + uint64_t host_pg_nr) +{ + unsigned long *bmap; + bool first_page_dirty; + uint64_t size = region->region.memory_size; + + /* getpage_size() is not always equal to vm->page_size */ + bmap = bitmap_zalloc(size / getpagesize()); + kvm_vm_get_dirty_log(vm, region->region.slot, bmap); + first_page_dirty = test_bit(host_pg_nr, bmap); + free(bmap); + return first_page_dirty; +} + +/* Returns true to continue the test, and false if it should be skipped. */ +static bool handle_cmd(struct kvm_vm *vm, int cmd) +{ + struct userspace_mem_region *data_region, *pt_region; + bool continue_test = true; + uint64_t pte_gpa, pte_pg; + + data_region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + pt_region = vm_get_mem_region(vm, MEM_REGION_PT); + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + pte_pg = (pte_gpa - pt_region->region.guest_phys_addr) / getpagesize(); + + if (cmd == CMD_SKIP_TEST) + continue_test = false; + + if (cmd & CMD_HOLE_PT) + continue_test = punch_hole_in_backing_store(vm, pt_region); + if (cmd & CMD_HOLE_DATA) + continue_test = punch_hole_in_backing_store(vm, data_region); + if (cmd & CMD_CHECK_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, data_region, 0), + "Missing write in dirty log"); + if (cmd & CMD_CHECK_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(check_write_in_dirty_log(vm, pt_region, pte_pg), + "Missing s1ptw write in dirty log"); + if (cmd & CMD_CHECK_NO_WRITE_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, data_region, 0), + "Unexpected write in dirty log"); + if (cmd & CMD_CHECK_NO_S1PTW_WR_IN_DIRTY_LOG) + TEST_ASSERT(!check_write_in_dirty_log(vm, pt_region, pte_pg), + "Unexpected s1ptw write in dirty log"); + + return continue_test; +} + +void fail_vcpu_run_no_handler(int ret) +{ + TEST_FAIL("Unexpected vcpu run failure"); +} + +void fail_vcpu_run_mmio_no_syndrome_handler(int ret) +{ + TEST_ASSERT(errno == ENOSYS, + "The mmio handler should have returned not implemented."); + events.fail_vcpu_runs += 1; +} + +typedef uint32_t aarch64_insn_t; +extern aarch64_insn_t __exec_test[2]; + +noinline void __return_0x77(void) +{ + asm volatile("__exec_test: mov x0, #0x77\n" + "ret\n"); +} + +/* + * Note that this function runs on the host before the test VM 
starts: there's + * no need to sync the D$ and I$ caches. + */ +static void load_exec_code_for_test(struct kvm_vm *vm) +{ + uint64_t *code; + struct userspace_mem_region *region; + void *hva; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + hva = (void *)region->region.userspace_addr; + + assert(TEST_EXEC_GVA > TEST_GVA); + code = hva + TEST_EXEC_GVA - TEST_GVA; + memcpy(code, __exec_test, sizeof(__exec_test)); +} + +static void setup_abort_handlers(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu); + + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_DABT, no_dabt_handler); + vm_install_sync_handler(vm, VECTOR_SYNC_CURRENT, + ESR_EC_IABT, no_iabt_handler); +} + +static void setup_gva_maps(struct kvm_vm *vm) +{ + struct userspace_mem_region *region; + uint64_t pte_gpa; + + region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + /* Map TEST_GVA first. This will install a new PTE. */ + virt_pg_map(vm, TEST_GVA, region->region.guest_phys_addr); + /* Then map TEST_PTE_GVA to the above PTE. */ + pte_gpa = addr_hva2gpa(vm, virt_get_pte_hva(vm, TEST_GVA)); + virt_pg_map(vm, TEST_PTE_GVA, pte_gpa); +} + +enum pf_test_memslots { + CODE_AND_DATA_MEMSLOT, + PAGE_TABLE_MEMSLOT, + TEST_DATA_MEMSLOT, +}; + +/* + * Create a memslot for code and data at pfn=0, and test-data and PT ones + * at max_gfn. + */ +static void setup_memslots(struct kvm_vm *vm, struct test_params *p) +{ + uint64_t backing_src_pagesz = get_backing_src_pagesz(p->src_type); + uint64_t guest_page_size = vm->page_size; + uint64_t max_gfn = vm_compute_max_gfn(vm); + /* Enough for 2M of code when using 4K guest pages. */ + uint64_t code_npages = 512; + uint64_t pt_size, data_size, data_gpa; + + /* + * This test requires 1 pgd, 2 pud, 4 pmd, and 6 pte pages when using + * VM_MODE_P48V48_4K. Note that the .text takes ~1.6MBs. That's 13 + * pages. VM_MODE_P48V48_4K is the mode with most PT pages; let's use + * twice that just in case. 
+ */ + pt_size = 26 * guest_page_size; + + /* memslot sizes and gpa's must be aligned to the backing page size */ + pt_size = align_up(pt_size, backing_src_pagesz); + data_size = align_up(guest_page_size, backing_src_pagesz); + data_gpa = (max_gfn * guest_page_size) - data_size; + data_gpa = align_down(data_gpa, backing_src_pagesz); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, + CODE_AND_DATA_MEMSLOT, code_npages, 0); + vm->memslots[MEM_REGION_CODE] = CODE_AND_DATA_MEMSLOT; + vm->memslots[MEM_REGION_DATA] = CODE_AND_DATA_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa - pt_size, + PAGE_TABLE_MEMSLOT, pt_size / guest_page_size, + p->test_desc->pt_memslot_flags); + vm->memslots[MEM_REGION_PT] = PAGE_TABLE_MEMSLOT; + + vm_userspace_mem_region_add(vm, p->src_type, data_gpa, TEST_DATA_MEMSLOT, + data_size / guest_page_size, + p->test_desc->data_memslot_flags); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void setup_ucall(struct kvm_vm *vm) +{ + struct userspace_mem_region *region = vm_get_mem_region(vm, MEM_REGION_TEST_DATA); + + ucall_init(vm, region->region.guest_phys_addr + region->region.memory_size); +} + +static void setup_default_handlers(struct test_desc *test) +{ + if (!test->mmio_handler) + test->mmio_handler = mmio_no_handler; + + if (!test->fail_vcpu_run_handler) + test->fail_vcpu_run_handler = fail_vcpu_run_no_handler; +} + +static void check_event_counts(struct test_desc *test) +{ + TEST_ASSERT_EQ(test->expected_events.uffd_faults, events.uffd_faults); + TEST_ASSERT_EQ(test->expected_events.mmio_exits, events.mmio_exits); + TEST_ASSERT_EQ(test->expected_events.fail_vcpu_runs, events.fail_vcpu_runs); +} + +static void print_test_banner(enum vm_guest_mode mode, struct test_params *p) +{ + struct test_desc *test = p->test_desc; + + pr_debug("Test: %s\n", test->name); + pr_debug("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(p->src_type)->name); +} + +static void reset_event_counts(void) +{ + memset(&events, 0, sizeof(events)); +} + +/* + * This function either succeeds, skips the test (after setting test->skip), or + * fails with a TEST_FAIL that aborts all tests. + */ +static void vcpu_run_loop(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + struct test_desc *test) +{ + struct kvm_run *run; + struct ucall uc; + int ret; + + run = vcpu->run; + + for (;;) { + ret = _vcpu_run(vcpu); + if (ret) { + test->fail_vcpu_run_handler(ret); + goto done; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!handle_cmd(vm, uc.args[1])) { + test->skip = true; + goto done; + } + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + case UCALL_NONE: + if (run->exit_reason == KVM_EXIT_MMIO) + test->mmio_handler(vm, run); + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + pr_debug(test->skip ? 
"Skipped.\n" : "Done.\n"); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = (struct test_params *)arg; + struct test_desc *test = p->test_desc; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct uffd_desc *pt_uffd, *data_uffd; + + print_test_banner(mode, p); + + vm = ____vm_create(VM_SHAPE(mode)); + setup_memslots(vm, p); + kvm_vm_elf_load(vm, program_invocation_name); + setup_ucall(vm); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + setup_gva_maps(vm); + + reset_event_counts(); + + /* + * Set some code in the data memslot for the guest to execute (only + * applicable to the EXEC tests). This has to be done before + * setup_uffd() as that function copies the memslot data for the uffd + * handler. + */ + load_exec_code_for_test(vm); + setup_uffd(vm, p, &pt_uffd, &data_uffd); + setup_abort_handlers(vm, vcpu, test); + setup_default_handlers(test); + vcpu_args_set(vcpu, 1, test); + + vcpu_run_loop(vm, vcpu, test); + + kvm_vm_free(vm); + free_uffd(test, pt_uffd, data_uffd); + + /* + * Make sure we check the events after the uffd threads have exited, + * which means they updated their respective event counters. + */ + if (!test->skip) + check_event_counts(test); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-s mem-type]\n", name); + puts(""); + guest_modes_help(); + backing_src_help("-s"); + puts(""); +} + +#define SNAME(s) #s +#define SCAT2(a, b) SNAME(a ## _ ## b) +#define SCAT3(a, b, c) SCAT2(a, SCAT2(b, c)) +#define SCAT4(a, b, c, d) SCAT2(a, SCAT3(b, c, d)) + +#define _CHECK(_test) _CHECK_##_test +#define _PREPARE(_test) _PREPARE_##_test +#define _PREPARE_guest_read64 NULL +#define _PREPARE_guest_ld_preidx NULL +#define _PREPARE_guest_write64 NULL +#define _PREPARE_guest_st_preidx NULL +#define _PREPARE_guest_exec NULL +#define _PREPARE_guest_at NULL +#define _PREPARE_guest_dc_zva guest_check_dc_zva +#define _PREPARE_guest_cas guest_check_lse + +/* With or without access flag checks */ +#define _PREPARE_with_af guest_set_ha, guest_clear_pte_af +#define _PREPARE_no_af NULL +#define _CHECK_with_af guest_check_pte_af +#define _CHECK_no_af NULL + +/* Performs an access and checks that no faults were triggered. 
*/ +#define TEST_ACCESS(_access, _with_af, _mark_cmd) \ +{ \ + .name = SCAT3(_access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .expected_events = { 0 }, \ +} + +#define TEST_UFFD(_access, _with_af, _mark_cmd, \ + _uffd_data_handler, _uffd_pt_handler, _uffd_faults) \ +{ \ + .name = SCAT4(uffd, _access, _with_af, #_mark_cmd), \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = _mark_cmd, \ + .guest_test_check = { _CHECK(_with_af) }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = _uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ +} + +#define TEST_DIRTY_LOG(_access, _with_af, _test_check, _pt_check) \ +{ \ + .name = SCAT3(dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ + .expected_events = { 0 }, \ +} + +#define TEST_UFFD_AND_DIRTY_LOG(_access, _with_af, _uffd_data_handler, \ + _uffd_faults, _test_check, _pt_check) \ +{ \ + .name = SCAT3(uffd_and_dirty_log, _access, _with_af), \ + .data_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_with_af), \ + _PREPARE(_access) }, \ + .guest_test = _access, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_test_check = { _CHECK(_with_af), _test_check, _pt_check }, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .expected_events = { .uffd_faults = _uffd_faults, }, \ +} + +#define TEST_RO_MEMSLOT(_access, _mmio_handler, _mmio_exits) \ +{ \ + .name = SCAT2(ro_memslot, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME(_access) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1 }, \ +} + +#define TEST_RO_MEMSLOT_AND_DIRTY_LOG(_access, _mmio_handler, _mmio_exits, \ + _test_check) \ +{ \ + .name = SCAT2(ro_memslot, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits}, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(_access, _test_check) \ +{ \ + .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ + .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .guest_test_check = { _test_check }, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + 
.expected_events = { .fail_vcpu_runs = 1 }, \ +} + +#define TEST_RO_MEMSLOT_AND_UFFD(_access, _mmio_handler, _mmio_exits, \ + _uffd_data_handler, _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_uffd, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .mmio_handler = _mmio_handler, \ + .expected_events = { .mmio_exits = _mmio_exits, \ + .uffd_faults = _uffd_faults }, \ +} + +#define TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(_access, _uffd_data_handler, \ + _uffd_faults) \ +{ \ + .name = SCAT2(ro_memslot_no_syndrome, _access), \ + .data_memslot_flags = KVM_MEM_READONLY, \ + .pt_memslot_flags = KVM_MEM_READONLY, \ + .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ + .guest_test = _access, \ + .uffd_data_handler = _uffd_data_handler, \ + .uffd_pt_handler = uffd_pt_handler, \ + .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ + .expected_events = { .fail_vcpu_runs = 1, \ + .uffd_faults = _uffd_faults }, \ +} + +static struct test_desc tests[] = { + + /* Check that HW is setting the Access Flag (AF) (sanity checks). */ + TEST_ACCESS(guest_read64, with_af, CMD_NONE), + TEST_ACCESS(guest_ld_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_cas, with_af, CMD_NONE), + TEST_ACCESS(guest_write64, with_af, CMD_NONE), + TEST_ACCESS(guest_st_preidx, with_af, CMD_NONE), + TEST_ACCESS(guest_dc_zva, with_af, CMD_NONE), + TEST_ACCESS(guest_exec, with_af, CMD_NONE), + + /* + * Punch a hole in the data backing store, and then try multiple + * accesses: reads should return zeroes, and writes should + * re-populate the page. Moreover, the test also checks that no + * exception was generated in the guest. Note that this + * reading/writing behavior is the same as reading/writing a + * punched page (with fallocate(FALLOC_FL_PUNCH_HOLE)) from + * userspace. + */ + TEST_ACCESS(guest_read64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_cas, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_ld_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_write64, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_st_preidx, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_at, no_af, CMD_HOLE_DATA), + TEST_ACCESS(guest_dc_zva, no_af, CMD_HOLE_DATA), + + /* + * Punch holes in the data and PT backing stores and mark them for + * userfaultfd handling. This should result in 2 faults: the access + * on the data backing store, and its respective S1 page table walk + * (S1PTW). + */ + TEST_UFFD(guest_read64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_read64, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_cas, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + /* + * Can't test guest_at with_af as it's IMPDEF whether the AF is set. + * The S1PTW fault should still be marked as a write. 
+ */ + TEST_UFFD(guest_at, no_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_no_handler, uffd_pt_handler, 1), + TEST_UFFD(guest_ld_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_write64, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_dc_zva, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_st_preidx, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + TEST_UFFD(guest_exec, with_af, CMD_HOLE_DATA | CMD_HOLE_PT, + uffd_data_handler, uffd_pt_handler, 2), + + /* + * Try accesses when the data and PT memory regions are both + * tracked for dirty logging. + */ + TEST_DIRTY_LOG(guest_read64, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_read64, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_ld_preidx, with_af, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_at, no_af, guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_exec, with_af, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_write64, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_cas, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_dc_zva, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_DIRTY_LOG(guest_st_preidx, with_af, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + + /* + * Access when the data and PT memory regions are both marked for + * dirty logging and UFFD at the same time. The expected result is + * that writes should mark the dirty log and trigger a userfaultfd + * write fault. Reads/execs should result in a read userfaultfd + * fault, and nothing in the dirty log. Any S1PTW should result in + * a write in the dirty log and a userfaultfd write. 
+ */ + TEST_UFFD_AND_DIRTY_LOG(guest_read64, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_read64, no_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_no_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_ld_preidx, with_af, + uffd_data_handler, + 2, guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_at, with_af, uffd_no_handler, 1, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_exec, with_af, + uffd_data_handler, 2, + guest_check_no_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_write64, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_cas, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_dc_zva, with_af, + uffd_data_handler, + 2, guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + TEST_UFFD_AND_DIRTY_LOG(guest_st_preidx, with_af, + uffd_data_handler, 2, + guest_check_write_in_dirty_log, + guest_check_s1ptw_wr_in_dirty_log), + /* + * Access when both the PT and data regions are marked read-only + * (with KVM_MEM_READONLY). Writes with a syndrome result in an + * MMIO exit, writes with no syndrome (e.g., CAS) result in a + * failed vcpu run, and reads/execs with and without syndromes do + * not fault. + */ + TEST_RO_MEMSLOT(guest_read64, 0, 0), + TEST_RO_MEMSLOT(guest_ld_preidx, 0, 0), + TEST_RO_MEMSLOT(guest_at, 0, 0), + TEST_RO_MEMSLOT(guest_exec, 0, 0), + TEST_RO_MEMSLOT(guest_write64, mmio_on_test_gpa_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_dc_zva), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_cas), + TEST_RO_MEMSLOT_NO_SYNDROME(guest_st_preidx), + + /* + * The PT and data regions are both read-only and marked + * for dirty logging at the same time. The expected result is that + * for writes there should be no write in the dirty log. The + * readonly handling is the same as if the memslot was not marked + * for dirty logging: writes with a syndrome result in an MMIO + * exit, and writes with no syndrome result in a failed vcpu run. + */ + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_read64, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_ld_preidx, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_at, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_exec, 0, 0, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_AND_DIRTY_LOG(guest_write64, mmio_on_test_gpa_handler, + 1, guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_dc_zva, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_cas, + guest_check_no_write_in_dirty_log), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_DIRTY_LOG(guest_st_preidx, + guest_check_no_write_in_dirty_log), + + /* + * The PT and data regions are both read-only and punched with + * holes tracked with userfaultfd. The expected result is the + * union of both userfaultfd and read-only behaviors. For example, + * write accesses result in a userfaultfd write fault and an MMIO + * exit. Writes with no syndrome result in a failed vcpu run and + * no userfaultfd write fault. Reads result in userfaultfd getting + * triggered. 
+ */ + TEST_RO_MEMSLOT_AND_UFFD(guest_read64, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_ld_preidx, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_at, 0, 0, uffd_no_handler, 1), + TEST_RO_MEMSLOT_AND_UFFD(guest_exec, 0, 0, uffd_data_handler, 2), + TEST_RO_MEMSLOT_AND_UFFD(guest_write64, mmio_on_test_gpa_handler, 1, + uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_cas, uffd_data_handler, 2), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_dc_zva, uffd_no_handler, 1), + TEST_RO_MEMSLOT_NO_SYNDROME_AND_UFFD(guest_st_preidx, uffd_no_handler, 1), + + { 0 } +}; + +static void for_each_test_and_guest_mode(enum vm_mem_backing_src_type src_type) +{ + struct test_desc *t; + + for (t = &tests[0]; t->name; t++) { + if (t->skip) + continue; + + struct test_params p = { + .src_type = src_type, + .test_desc = t, + }; + + for_each_guest_mode(run_test, &p); + } +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type; + int opt; + + src_type = DEFAULT_VM_MEM_SRC; + + while ((opt = getopt(argc, argv, "hm:s:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_test_and_guest_mode(src_type); + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/psci_test.c b/tools/testing/selftests/kvm/aarch64/psci_test.c new file mode 100644 index 000000000000..61731a950def --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/psci_test.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * psci_test - Tests relating to KVM's PSCI implementation. + * + * Copyright (c) 2021 Google LLC. + * + * This test includes: + * - A regression test for a race between KVM servicing the PSCI CPU_ON call + * and userspace reading the targeted vCPU's registers. + * - A test for KVM's handling of PSCI SYSTEM_SUSPEND and the associated + * KVM_SYSTEM_EVENT_SUSPEND UAPI. 
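+ *
+ * Both tests park the target vCPU with KVM_MP_STATE_STOPPED (vcpu_power_off())
+ * and drive the PSCI calls from a second, source vCPU via HVC (smccc_hvc()).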
+ */ + +#include <linux/kernel.h> +#include <linux/psci.h> +#include <asm/cputype.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define CPU_ON_ENTRY_ADDR 0xfeedf00dul +#define CPU_ON_CONTEXT_ID 0xdeadc0deul + +static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, + uint64_t context_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_0_2_FN64_CPU_ON, target_cpu, entry_addr, context_id, + 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_affinity_info(uint64_t target_affinity, + uint64_t lowest_affinity_level) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity, lowest_affinity_level, + 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_system_suspend(uint64_t entry_addr, uint64_t context_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_0_FN64_SYSTEM_SUSPEND, entry_addr, context_id, + 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static uint64_t psci_features(uint32_t func_id) +{ + struct arm_smccc_res res; + + smccc_hvc(PSCI_1_0_FN_PSCI_FEATURES, func_id, 0, 0, 0, 0, 0, 0, &res); + + return res.a0; +} + +static void vcpu_power_off(struct kvm_vcpu *vcpu) +{ + struct kvm_mp_state mp_state = { + .mp_state = KVM_MP_STATE_STOPPED, + }; + + vcpu_mp_state_set(vcpu, &mp_state); +} + +static struct kvm_vm *setup_vm(void *guest_code, struct kvm_vcpu **source, + struct kvm_vcpu **target) +{ + struct kvm_vcpu_init init; + struct kvm_vm *vm; + + vm = vm_create(2); + + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + *source = aarch64_vcpu_add(vm, 0, &init, guest_code); + *target = aarch64_vcpu_add(vm, 1, &init, guest_code); + + return vm; +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + if (get_ucall(vcpu, &uc) == UCALL_ABORT) + REPORT_GUEST_ASSERT(uc); +} + +static void assert_vcpu_reset(struct kvm_vcpu *vcpu) +{ + uint64_t obs_pc, obs_x0; + + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &obs_pc); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + + TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, + "unexpected target cpu pc: %lx (expected: %lx)", + obs_pc, CPU_ON_ENTRY_ADDR); + TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, + "unexpected target context id: %lx (expected: %lx)", + obs_x0, CPU_ON_CONTEXT_ID); +} + +static void guest_test_cpu_on(uint64_t target_cpu) +{ + uint64_t target_state; + + GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); + + do { + target_state = psci_affinity_info(target_cpu, 0); + + GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || + (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); + } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + GUEST_DONE(); +} + +static void host_test_cpu_on(void) +{ + struct kvm_vcpu *source, *target; + uint64_t target_mpidr; + struct kvm_vm *vm; + struct ucall uc; + + vm = setup_vm(guest_test_cpu_on, &source, &target); + + /* + * make sure the target is already off when executing the test. 
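+	 * CPU_ON only brings an offline CPU online, and assert_vcpu_reset()
+	 * relies on the target entering at CPU_ON_ENTRY_ADDR with
+	 * CPU_ON_CONTEXT_ID in x0.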
+ */ + vcpu_power_off(target); + + vcpu_get_reg(target, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &target_mpidr); + vcpu_args_set(source, 1, target_mpidr & MPIDR_HWID_BITMASK); + enter_guest(source); + + if (get_ucall(source, &uc) != UCALL_DONE) + TEST_FAIL("Unhandled ucall: %lu", uc.cmd); + + assert_vcpu_reset(target); + kvm_vm_free(vm); +} + +static void guest_test_system_suspend(void) +{ + uint64_t ret; + + /* assert that SYSTEM_SUSPEND is discoverable */ + GUEST_ASSERT(!psci_features(PSCI_1_0_FN_SYSTEM_SUSPEND)); + GUEST_ASSERT(!psci_features(PSCI_1_0_FN64_SYSTEM_SUSPEND)); + + ret = psci_system_suspend(CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID); + GUEST_SYNC(ret); +} + +static void host_test_system_suspend(void) +{ + struct kvm_vcpu *source, *target; + struct kvm_run *run; + struct kvm_vm *vm; + + vm = setup_vm(guest_test_system_suspend, &source, &target); + vm_enable_cap(vm, KVM_CAP_ARM_SYSTEM_SUSPEND, 0); + + vcpu_power_off(target); + run = source->run; + + enter_guest(source); + + TEST_ASSERT_KVM_EXIT_REASON(source, KVM_EXIT_SYSTEM_EVENT); + TEST_ASSERT(run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND, + "Unhandled system event: %u (expected: %u)", + run->system_event.type, KVM_SYSTEM_EVENT_SUSPEND); + + kvm_vm_free(vm); +} + +int main(void) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SYSTEM_SUSPEND)); + + host_test_cpu_on(); + host_test_system_suspend(); + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c new file mode 100644 index 000000000000..a7de39fa2a0a --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * set_id_regs - Test for setting ID register from usersapce. + * + * Copyright (c) 2023 Google LLC. + * + * + * Test that KVM supports setting ID registers from userspace and handles the + * feature set correctly. + */ + +#include <stdint.h> +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include <linux/bitfield.h> + +enum ftr_type { + FTR_EXACT, /* Use a predefined safe value */ + FTR_LOWER_SAFE, /* Smaller value is safe */ + FTR_HIGHER_SAFE, /* Bigger value is safe */ + FTR_HIGHER_OR_ZERO_SAFE, /* Bigger value is safe, but 0 is biggest */ + FTR_END, /* Mark the last ftr bits */ +}; + +#define FTR_SIGNED true /* Value should be treated as signed */ +#define FTR_UNSIGNED false /* Value should be treated as unsigned */ + +struct reg_ftr_bits { + char *name; + bool sign; + enum ftr_type type; + uint8_t shift; + uint64_t mask; + /* + * For FTR_EXACT, safe_val is used as the exact safe value. + * For FTR_LOWER_SAFE, safe_val is used as the minimal safe value. 
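+	 * FTR_HIGHER_SAFE and FTR_HIGHER_OR_ZERO_SAFE do not use safe_val;
+	 * see get_safe_value() and get_invalid_value() below.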
+ */ + int64_t safe_val; +}; + +struct test_feature_reg { + uint32_t reg; + const struct reg_ftr_bits *ftr_bits; +}; + +#define __REG_FTR_BITS(NAME, SIGNED, TYPE, SHIFT, MASK, SAFE_VAL) \ + { \ + .name = #NAME, \ + .sign = SIGNED, \ + .type = TYPE, \ + .shift = SHIFT, \ + .mask = MASK, \ + .safe_val = SAFE_VAL, \ + } + +#define REG_FTR_BITS(type, reg, field, safe_val) \ + __REG_FTR_BITS(reg##_##field, FTR_UNSIGNED, type, reg##_##field##_SHIFT, \ + reg##_##field##_MASK, safe_val) + +#define S_REG_FTR_BITS(type, reg, field, safe_val) \ + __REG_FTR_BITS(reg##_##field, FTR_SIGNED, type, reg##_##field##_SHIFT, \ + reg##_##field##_MASK, safe_val) + +#define REG_FTR_END \ + { \ + .type = FTR_END, \ + } + +static const struct reg_ftr_bits ftr_id_aa64dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, PMUVer, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64DFR0_EL1, DebugVer, ID_AA64DFR0_EL1_DebugVer_IMP), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_dfr0_el1[] = { + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, PerfMon, ID_DFR0_EL1_PerfMon_PMUv3), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_DFR0_EL1, CopDbg, ID_DFR0_EL1_CopDbg_Armv8), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RNDR, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TLB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, FHM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, DP, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM4, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SM3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, RDM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, TME, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, ATOMIC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, CRC32, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, SHA1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR0_EL1, AES, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LS64, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, XS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, I8MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DGH, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, BF16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SPECRES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, SB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FRINTTS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, LRCPC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, FCMA, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, JSCVT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR1_EL1, DPB, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, BC, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, RPRES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR2_EL1, WFxT, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64pfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, CSV2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, DIT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, SEL2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL3, 0), + 
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL2, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64PFR0_EL1, EL0, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN4, 0), + S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN64, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGENDEL0, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, SNSMEM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, BIGEND, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ASIDBITS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, PARANGE, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr1_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, TIDCP1, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, AFP, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, ETS, 0), + REG_FTR_BITS(FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1, SpecSEI, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, PAN, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, LO, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HPDS, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR1_EL1, HAFDBS, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64mmfr2_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, E0PD, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, BBM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, TTL, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, AT, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, ST, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, VARange, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, IESB, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, LSM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, UAO, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR2_EL1, CnP, 0), + REG_FTR_END, +}; + +static const struct reg_ftr_bits ftr_id_aa64zfr0_el1[] = { + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F64MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, F32MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, I8MM, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SM4, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SHA3, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BF16, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, BitPerm, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, AES, 0), + REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ZFR0_EL1, SVEver, 0), + REG_FTR_END, +}; + +#define TEST_REG(id, table) \ + { \ + .reg = id, \ + .ftr_bits = &((table)[0]), \ + } + +static struct test_feature_reg test_regs[] = { + TEST_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0_el1), + TEST_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0_el1), + TEST_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0_el1), + TEST_REG(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1_el1), + TEST_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2_el1), + TEST_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0_el1), + TEST_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0_el1), + TEST_REG(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1_el1), + TEST_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2_el1), + TEST_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0_el1), +}; + +#define GUEST_REG_SYNC(id) GUEST_SYNC_ARGS(0, id, read_sysreg_s(id), 0, 0); + +static void guest_code(void) +{ + 
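+	/*
+	 * Each GUEST_REG_SYNC reports the register encoding and the value the
+	 * guest reads back; the host compares it against the value it wrote
+	 * (see test_guest_reg_read()).
+	 */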
GUEST_REG_SYNC(SYS_ID_AA64DFR0_EL1); + GUEST_REG_SYNC(SYS_ID_DFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR1_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ISAR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64PFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR0_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR1_EL1); + GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); + GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); + + GUEST_DONE(); +} + +/* Return a safe value to a given ftr_bits an ftr value */ +uint64_t get_safe_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) +{ + uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + if (ftr_bits->sign == FTR_UNSIGNED) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = ftr_bits->safe_val; + break; + case FTR_LOWER_SAFE: + if (ftr > ftr_bits->safe_val) + ftr--; + break; + case FTR_HIGHER_SAFE: + if (ftr < ftr_max) + ftr++; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == ftr_max) + ftr = 0; + else if (ftr != 0) + ftr++; + break; + default: + break; + } + } else if (ftr != ftr_max) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = ftr_bits->safe_val; + break; + case FTR_LOWER_SAFE: + if (ftr > ftr_bits->safe_val) + ftr--; + break; + case FTR_HIGHER_SAFE: + if (ftr < ftr_max - 1) + ftr++; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr != 0 && ftr != ftr_max - 1) + ftr++; + break; + default: + break; + } + } + + return ftr; +} + +/* Return an invalid value to a given ftr_bits an ftr value */ +uint64_t get_invalid_value(const struct reg_ftr_bits *ftr_bits, uint64_t ftr) +{ + uint64_t ftr_max = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + if (ftr_bits->sign == FTR_UNSIGNED) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); + break; + case FTR_LOWER_SAFE: + ftr++; + break; + case FTR_HIGHER_SAFE: + ftr--; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == 0) + ftr = ftr_max; + else + ftr--; + break; + default: + break; + } + } else if (ftr != ftr_max) { + switch (ftr_bits->type) { + case FTR_EXACT: + ftr = max((uint64_t)ftr_bits->safe_val + 1, ftr + 1); + break; + case FTR_LOWER_SAFE: + ftr++; + break; + case FTR_HIGHER_SAFE: + ftr--; + break; + case FTR_HIGHER_OR_ZERO_SAFE: + if (ftr == 0) + ftr = ftr_max - 1; + else + ftr--; + break; + default: + break; + } + } else { + ftr = 0; + } + + return ftr; +} + +static uint64_t test_reg_set_success(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) +{ + uint8_t shift = ftr_bits->shift; + uint64_t mask = ftr_bits->mask; + uint64_t val, new_val, ftr; + + vcpu_get_reg(vcpu, reg, &val); + ftr = (val & mask) >> shift; + + ftr = get_safe_value(ftr_bits, ftr); + + ftr <<= shift; + val &= ~mask; + val |= ftr; + + vcpu_set_reg(vcpu, reg, val); + vcpu_get_reg(vcpu, reg, &new_val); + TEST_ASSERT_EQ(new_val, val); + + return new_val; +} + +static void test_reg_set_fail(struct kvm_vcpu *vcpu, uint64_t reg, + const struct reg_ftr_bits *ftr_bits) +{ + uint8_t shift = ftr_bits->shift; + uint64_t mask = ftr_bits->mask; + uint64_t val, old_val, ftr; + int r; + + vcpu_get_reg(vcpu, reg, &val); + ftr = (val & mask) >> shift; + + ftr = get_invalid_value(ftr_bits, ftr); + + old_val = val; + ftr <<= shift; + val &= ~mask; + val |= ftr; + + r = __vcpu_set_reg(vcpu, reg, val); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Unexpected KVM_SET_ONE_REG error: r=%d, errno=%d", r, errno); + + vcpu_get_reg(vcpu, reg, &val); + TEST_ASSERT_EQ(val, old_val); +} + +static uint64_t test_reg_vals[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + +#define 
encoding_to_range_idx(encoding) \ + KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(encoding), sys_reg_Op1(encoding), \ + sys_reg_CRn(encoding), sys_reg_CRm(encoding), \ + sys_reg_Op2(encoding)) + + +static void test_vm_ftr_id_regs(struct kvm_vcpu *vcpu, bool aarch64_only) +{ + uint64_t masks[KVM_ARM_FEATURE_ID_RANGE_SIZE]; + struct reg_mask_range range = { + .addr = (__u64)masks, + }; + int ret; + + /* KVM should return error when reserved field is not zero */ + range.reserved[0] = 1; + ret = __vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + TEST_ASSERT(ret, "KVM doesn't check invalid parameters."); + + /* Get writable masks for feature ID registers */ + memset(range.reserved, 0, sizeof(range.reserved)); + vm_ioctl(vcpu->vm, KVM_ARM_GET_REG_WRITABLE_MASKS, &range); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) { + const struct reg_ftr_bits *ftr_bits = test_regs[i].ftr_bits; + uint32_t reg_id = test_regs[i].reg; + uint64_t reg = KVM_ARM64_SYS_REG(reg_id); + int idx; + + /* Get the index to masks array for the idreg */ + idx = encoding_to_range_idx(reg_id); + + for (int j = 0; ftr_bits[j].type != FTR_END; j++) { + /* Skip aarch32 reg on aarch64 only system, since they are RAZ/WI. */ + if (aarch64_only && sys_reg_CRm(reg_id) < 4) { + ksft_test_result_skip("%s on AARCH64 only system\n", + ftr_bits[j].name); + continue; + } + + /* Make sure the feature field is writable */ + TEST_ASSERT_EQ(masks[idx] & ftr_bits[j].mask, ftr_bits[j].mask); + + test_reg_set_fail(vcpu, reg, &ftr_bits[j]); + + test_reg_vals[idx] = test_reg_set_success(vcpu, reg, + &ftr_bits[j]); + + ksft_test_result_pass("%s\n", ftr_bits[j].name); + } + } +} + +static void test_guest_reg_read(struct kvm_vcpu *vcpu) +{ + bool done = false; + struct ucall uc; + + while (!done) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_SYNC: + /* Make sure the written values are seen by guest */ + TEST_ASSERT_EQ(test_reg_vals[encoding_to_range_idx(uc.args[2])], + uc.args[3]); + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +/* Politely lifted from arch/arm64/include/asm/cache.h */ +/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */ +#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1)) +#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level)) +#define CLIDR_CTYPE(clidr, level) \ + (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level)) + +static void test_clidr(struct kvm_vcpu *vcpu) +{ + uint64_t clidr; + int level; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), &clidr); + + /* find the first empty level in the cache hierarchy */ + for (level = 1; level < 7; level++) { + if (!CLIDR_CTYPE(clidr, level)) + break; + } + + /* + * If you have a mind-boggling 7 levels of cache, congratulations, you + * get to fix this. 
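+	 * (CLIDR_EL1 describes at most 7 cache levels, hence the bound on the
+	 * search above.)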
+ */ + TEST_ASSERT(level <= 7, "can't find an empty level in cache hierarchy"); + + /* stick in a unified cache level */ + clidr |= BIT(2) << CLIDR_CTYPE_SHIFT(level); + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CLIDR_EL1), clidr); + test_reg_vals[encoding_to_range_idx(SYS_CLIDR_EL1)] = clidr; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + u64 val; + + test_clidr(vcpu); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), &val); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + + test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; + ksft_test_result_pass("%s\n", __func__); +} + +static void test_assert_id_reg_unchanged(struct kvm_vcpu *vcpu, uint32_t encoding) +{ + size_t idx = encoding_to_range_idx(encoding); + uint64_t observed; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(encoding), &observed); + TEST_ASSERT_EQ(test_reg_vals[idx], observed); +} + +static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) +{ + /* + * Calls KVM_ARM_VCPU_INIT behind the scenes, which will do an + * architectural reset of the vCPU. + */ + aarch64_vcpu_setup(vcpu, NULL); + + for (int i = 0; i < ARRAY_SIZE(test_regs); i++) + test_assert_id_reg_unchanged(vcpu, test_regs[i].reg); + + test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); + + ksft_test_result_pass("%s\n", __func__); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + bool aarch64_only; + uint64_t val, el0; + int test_cnt; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Check for AARCH64 only system */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); + el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); + aarch64_only = (el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + + ksft_print_header(); + + test_cnt = ARRAY_SIZE(ftr_id_aa64dfr0_el1) + ARRAY_SIZE(ftr_id_dfr0_el1) + + ARRAY_SIZE(ftr_id_aa64isar0_el1) + ARRAY_SIZE(ftr_id_aa64isar1_el1) + + ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - + ARRAY_SIZE(test_regs) + 2; + + ksft_set_plan(test_cnt); + + test_vm_ftr_id_regs(vcpu, aarch64_only); + test_vcpu_ftr_id_regs(vcpu); + + test_guest_reg_read(vcpu); + + test_reset_preserves_id_regs(vcpu); + + kvm_vm_free(vm); + + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/aarch64/smccc_filter.c b/tools/testing/selftests/kvm/aarch64/smccc_filter.c new file mode 100644 index 000000000000..2d189f3da228 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/smccc_filter.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * smccc_filter - Tests for the SMCCC filter UAPI. + * + * Copyright (c) 2023 Google LLC + * + * This test includes: + * - Tests that the UAPI constraints are upheld by KVM. For example, userspace + * is prevented from filtering the architecture range of SMCCC calls. + * - Test that the filter actions (DENIED, FWD_TO_USER) work as intended. 
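+ *
+ * A filter range is described by struct kvm_smccc_filter (base function ID,
+ * number of functions, action) and installed via the KVM_ARM_VM_SMCCC_CTRL
+ * group's KVM_ARM_VM_SMCCC_FILTER device attribute; see __set_smccc_filter()
+ * below.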
+ */ + +#include <linux/arm-smccc.h> +#include <linux/psci.h> +#include <stdint.h> + +#include "processor.h" +#include "test_util.h" + +enum smccc_conduit { + HVC_INSN, + SMC_INSN, +}; + +#define for_each_conduit(conduit) \ + for (conduit = HVC_INSN; conduit <= SMC_INSN; conduit++) + +static void guest_main(uint32_t func_id, enum smccc_conduit conduit) +{ + struct arm_smccc_res res; + + if (conduit == SMC_INSN) + smccc_smc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); + else + smccc_hvc(func_id, 0, 0, 0, 0, 0, 0, 0, &res); + + GUEST_SYNC(res.a0); +} + +static int __set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, + enum kvm_smccc_filter_action action) +{ + struct kvm_smccc_filter filter = { + .base = start, + .nr_functions = nr_functions, + .action = action, + }; + + return __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, + KVM_ARM_VM_SMCCC_FILTER, &filter); +} + +static void set_smccc_filter(struct kvm_vm *vm, uint32_t start, uint32_t nr_functions, + enum kvm_smccc_filter_action action) +{ + int ret = __set_smccc_filter(vm, start, nr_functions, action); + + TEST_ASSERT(!ret, "failed to configure SMCCC filter: %d", ret); +} + +static struct kvm_vm *setup_vm(struct kvm_vcpu **vcpu) +{ + struct kvm_vcpu_init init; + struct kvm_vm *vm; + + vm = vm_create(1); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + + /* + * Enable in-kernel emulation of PSCI to ensure that calls are denied + * due to the SMCCC filter, not because of KVM. + */ + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + *vcpu = aarch64_vcpu_add(vm, 0, &init, guest_main); + return vm; +} + +static void test_pad_must_be_zero(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + struct kvm_smccc_filter filter = { + .base = PSCI_0_2_FN_PSCI_VERSION, + .nr_functions = 1, + .action = KVM_SMCCC_FILTER_DENY, + .pad = { -1 }, + }; + int r; + + r = __kvm_device_attr_set(vm->fd, KVM_ARM_VM_SMCCC_CTRL, + KVM_ARM_VM_SMCCC_FILTER, &filter); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Setting filter with nonzero padding should return EINVAL"); +} + +/* Ensure that userspace cannot filter the Arm Architecture SMCCC range */ +static void test_filter_reserved_range(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + uint32_t smc64_fn; + int r; + + r = __set_smccc_filter(vm, ARM_SMCCC_ARCH_WORKAROUND_1, + 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter reserved range should return EEXIST"); + + smc64_fn = ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, + 0, 0); + + r = __set_smccc_filter(vm, smc64_fn, 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter reserved range should return EEXIST"); + + kvm_vm_free(vm); +} + +static void test_invalid_nr_functions(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 0, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to filter 0 functions should return EINVAL"); + + kvm_vm_free(vm); +} + +static void test_overflow_nr_functions(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, ~0, ~0, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to overflow filter range should return EINVAL"); + + kvm_vm_free(vm); +} + +static void test_reserved_action(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + r = __set_smccc_filter(vm, 
PSCI_0_2_FN64_CPU_ON, 1, -1); + TEST_ASSERT(r < 0 && errno == EINVAL, + "Attempt to use reserved filter action should return EINVAL"); + + kvm_vm_free(vm); +} + + +/* Test that overlapping configurations of the SMCCC filter are rejected */ +static void test_filter_overlap(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm = setup_vm(&vcpu); + int r; + + set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); + + r = __set_smccc_filter(vm, PSCI_0_2_FN64_CPU_ON, 1, KVM_SMCCC_FILTER_DENY); + TEST_ASSERT(r < 0 && errno == EEXIST, + "Attempt to filter already configured range should return EEXIST"); + + kvm_vm_free(vm); +} + +static void expect_call_denied(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) != UCALL_SYNC) + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + + TEST_ASSERT(uc.args[1] == SMCCC_RET_NOT_SUPPORTED, + "Unexpected SMCCC return code: %lu", uc.args[1]); +} + +/* Denied SMCCC calls have a return code of SMCCC_RET_NOT_SUPPORTED */ +static void test_filter_denied(void) +{ + enum smccc_conduit conduit; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + for_each_conduit(conduit) { + vm = setup_vm(&vcpu); + + set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_DENY); + vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); + + vcpu_run(vcpu); + expect_call_denied(vcpu); + + kvm_vm_free(vm); + } +} + +static void expect_call_fwd_to_user(struct kvm_vcpu *vcpu, uint32_t func_id, + enum smccc_conduit conduit) +{ + struct kvm_run *run = vcpu->run; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERCALL, + "Unexpected exit reason: %u", run->exit_reason); + TEST_ASSERT(run->hypercall.nr == func_id, + "Unexpected SMCCC function: %llu", run->hypercall.nr); + + if (conduit == SMC_INSN) + TEST_ASSERT(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC, + "KVM_HYPERCALL_EXIT_SMC is not set"); + else + TEST_ASSERT(!(run->hypercall.flags & KVM_HYPERCALL_EXIT_SMC), + "KVM_HYPERCALL_EXIT_SMC is set"); +} + +/* SMCCC calls forwarded to userspace cause KVM_EXIT_HYPERCALL exits */ +static void test_filter_fwd_to_user(void) +{ + enum smccc_conduit conduit; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + for_each_conduit(conduit) { + vm = setup_vm(&vcpu); + + set_smccc_filter(vm, PSCI_0_2_FN_PSCI_VERSION, 1, KVM_SMCCC_FILTER_FWD_TO_USER); + vcpu_args_set(vcpu, 2, PSCI_0_2_FN_PSCI_VERSION, conduit); + + vcpu_run(vcpu); + expect_call_fwd_to_user(vcpu, PSCI_0_2_FN_PSCI_VERSION, conduit); + + kvm_vm_free(vm); + } +} + +static bool kvm_supports_smccc_filter(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + int r; + + r = __kvm_has_device_attr(vm->fd, KVM_ARM_VM_SMCCC_CTRL, KVM_ARM_VM_SMCCC_FILTER); + + kvm_vm_free(vm); + return !r; +} + +int main(void) +{ + TEST_REQUIRE(kvm_supports_smccc_filter()); + + test_pad_must_be_zero(); + test_invalid_nr_functions(); + test_overflow_nr_functions(); + test_reserved_action(); + test_filter_reserved_range(); + test_filter_overlap(); + test_filter_denied(); + test_filter_fwd_to_user(); +} diff --git a/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c new file mode 100644 index 000000000000..80b74c6f152b --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vcpu_width_config.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vcpu_width_config - Test KVM_ARM_VCPU_INIT() with KVM_ARM_VCPU_EL1_32BIT. + * + * Copyright (c) 2022 Google LLC. 
+ * + * This is a test that ensures that non-mixed-width vCPUs (all 64bit vCPUs + * or all 32bit vcPUs) can be configured and mixed-width vCPUs cannot be + * configured. + */ + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + + +/* + * Add a vCPU, run KVM_ARM_VCPU_INIT with @init0, and then + * add another vCPU, and run KVM_ARM_VCPU_INIT with @init1. + */ +static int add_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) +{ + struct kvm_vcpu *vcpu0, *vcpu1; + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + vcpu0 = __vm_vcpu_add(vm, 0); + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); + if (ret) + goto free_exit; + + vcpu1 = __vm_vcpu_add(vm, 1); + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Add two vCPUs, then run KVM_ARM_VCPU_INIT for one vCPU with @init0, + * and run KVM_ARM_VCPU_INIT for another vCPU with @init1. + */ +static int add_2vcpus_init_2vcpus(struct kvm_vcpu_init *init0, + struct kvm_vcpu_init *init1) +{ + struct kvm_vcpu *vcpu0, *vcpu1; + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + vcpu0 = __vm_vcpu_add(vm, 0); + vcpu1 = __vm_vcpu_add(vm, 1); + + ret = __vcpu_ioctl(vcpu0, KVM_ARM_VCPU_INIT, init0); + if (ret) + goto free_exit; + + ret = __vcpu_ioctl(vcpu1, KVM_ARM_VCPU_INIT, init1); + +free_exit: + kvm_vm_free(vm); + return ret; +} + +/* + * Tests that two 64bit vCPUs can be configured, two 32bit vCPUs can be + * configured, and two mixed-width vCPUs cannot be configured. + * Each of those three cases, configure vCPUs in two different orders. + * The one is running KVM_CREATE_VCPU for 2 vCPUs, and then running + * KVM_ARM_VCPU_INIT for them. + * The other is running KVM_CREATE_VCPU and KVM_ARM_VCPU_INIT for a vCPU, + * and then run those commands for another vCPU. 
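+ * In both orderings, the mixed-width configuration is expected to fail at the
+ * second KVM_ARM_VCPU_INIT.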
+ */ +int main(void) +{ + struct kvm_vcpu_init init0, init1; + struct kvm_vm *vm; + int ret; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_EL1_32BIT)); + + /* Get the preferred target type and copy that to init1 for later use */ + vm = vm_create_barebones(); + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init0); + kvm_vm_free(vm); + init1 = init0; + + /* Test with 64bit vCPUs */ + ret = add_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 64bit EL1 vCPUs failed unexpectedly"); + + /* Test with 32bit vCPUs */ + init0.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init0); + TEST_ASSERT(ret == 0, + "Configuring 32bit EL1 vCPUs failed unexpectedly"); + + /* Test with mixed-width vCPUs */ + init0.features[0] = 0; + init1.features[0] = (1 << KVM_ARM_VCPU_EL1_32BIT); + ret = add_init_2vcpus(&init0, &init1); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + ret = add_2vcpus_init_2vcpus(&init0, &init1); + TEST_ASSERT(ret != 0, + "Configuring mixed-width vCPUs worked unexpectedly"); + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c new file mode 100644 index 000000000000..b3b5fb0ff0a9 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic init sequence tests + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <linux/kernel.h> +#include <sys/syscall.h> +#include <asm/kvm.h> +#include <asm/kvm_para.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vgic.h" + +#define NR_VCPUS 4 + +#define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset) + +#define GICR_TYPER 0x8 + +#define VGIC_DEV_IS_V2(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V2) +#define VGIC_DEV_IS_V3(_d) ((_d) == KVM_DEV_TYPE_ARM_VGIC_V3) + +struct vm_gic { + struct kvm_vm *vm; + int gic_fd; + uint32_t gic_dev_type; +}; + +static uint64_t max_phys_size; + +/* + * Helpers to access a redistributor register and verify the ioctl() failed or + * succeeded as expected, and provided the correct value on success. + */ +static void v3_redist_reg_get_errno(int gicv3_fd, int vcpu, int offset, + int want, const char *msg) +{ + uint32_t ignored_val; + int ret = __kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &ignored_val); + + TEST_ASSERT(ret && errno == want, "%s; want errno = %d", msg, want); +} + +static void v3_redist_reg_get(int gicv3_fd, int vcpu, int offset, uint32_t want, + const char *msg) +{ + uint32_t val; + + kvm_device_attr_get(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, + REG_OFFSET(vcpu, offset), &val); + TEST_ASSERT(val == want, "%s; want '0x%x', got '0x%x'", msg, want, val); +} + +/* dummy guest code */ +static void guest_code(void) +{ + GUEST_SYNC(0); + GUEST_SYNC(1); + GUEST_SYNC(2); + GUEST_DONE(); +} + +/* we don't want to assert on run execution, hence that helper */ +static int run_vcpu(struct kvm_vcpu *vcpu) +{ + return __vcpu_run(vcpu) ? 
-errno : 0; +} + +static struct vm_gic vm_gic_create_with_vcpus(uint32_t gic_dev_type, + uint32_t nr_vcpus, + struct kvm_vcpu *vcpus[]) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + +static struct vm_gic vm_gic_create_barebones(uint32_t gic_dev_type) +{ + struct vm_gic v; + + v.gic_dev_type = gic_dev_type; + v.vm = vm_create_barebones(); + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + return v; +} + + +static void vm_gic_destroy(struct vm_gic *v) +{ + close(v->gic_fd); + kvm_vm_free(v->vm); +} + +struct vgic_region_attr { + uint64_t attr; + uint64_t size; + uint64_t alignment; +}; + +struct vgic_region_attr gic_v3_dist_region = { + .attr = KVM_VGIC_V3_ADDR_TYPE_DIST, + .size = 0x10000, + .alignment = 0x10000, +}; + +struct vgic_region_attr gic_v3_redist_region = { + .attr = KVM_VGIC_V3_ADDR_TYPE_REDIST, + .size = NR_VCPUS * 0x20000, + .alignment = 0x10000, +}; + +struct vgic_region_attr gic_v2_dist_region = { + .attr = KVM_VGIC_V2_ADDR_TYPE_DIST, + .size = 0x1000, + .alignment = 0x1000, +}; + +struct vgic_region_attr gic_v2_cpu_region = { + .attr = KVM_VGIC_V2_ADDR_TYPE_CPU, + .size = 0x2000, + .alignment = 0x1000, +}; + +/** + * Helper routine that performs KVM device tests in general. Eventually the + * ARM_VGIC (GICv2 or GICv3) device gets created with an overlapping + * DIST/REDIST (or DIST/CPUIF for GICv2). Assumption is 4 vcpus are going to be + * used hence the overlap. In the case of GICv3, A RDIST region is set at @0x0 + * and a DIST region is set @0x70000. The GICv2 case sets a CPUIF @0x0 and a + * DIST region @0x1000. + */ +static void subtest_dist_rdist(struct vm_gic *v) +{ + int ret; + uint64_t addr; + struct vgic_region_attr rdist; /* CPU interface in GICv2*/ + struct vgic_region_attr dist; + + rdist = VGIC_DEV_IS_V3(v->gic_dev_type) ? gic_v3_redist_region + : gic_v2_cpu_region; + dist = VGIC_DEV_IS_V3(v->gic_dev_type) ? gic_v3_dist_region + : gic_v2_dist_region; + + /* Check existing group/attributes */ + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, dist.attr); + + kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, rdist.attr); + + /* check non existing attribute */ + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, -1); + TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); + + /* misaligned DIST and REDIST address settings */ + addr = dist.alignment / 0x10; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); + TEST_ASSERT(ret && errno == EINVAL, "GIC dist base not aligned"); + + addr = rdist.alignment / 0x10; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == EINVAL, "GIC redist/cpu base not aligned"); + + /* out of range address */ + addr = max_phys_size; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); + + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); + + /* Space for half a rdist (a rdist is: 2 * rdist.alignment). 
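+	 * The base below sits one alignment unit under the IPA limit, leaving
+	 * room for only half of a single redistributor, so KVM must reject it
+	 * with E2BIG.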
*/ + addr = max_phys_size - dist.alignment; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "half of the redist is beyond IPA limit"); + + /* set REDIST base address @0x0*/ + addr = 0x00000; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + + /* Attempt to create a second legacy redistributor region */ + addr = 0xE0000; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + rdist.attr, &addr); + TEST_ASSERT(ret && errno == EEXIST, "GIC redist base set again"); + + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + if (!ret) { + /* Attempt to mix legacy and new redistributor regions */ + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "attempt to mix GICv3 REDIST and REDIST_REGION"); + } + + /* + * Set overlapping DIST / REDIST, cannot be detected here. Will be detected + * on first vcpu run instead. + */ + addr = rdist.size - rdist.alignment; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + dist.attr, &addr); +} + +/* Test the new REDIST region API */ +static void subtest_v3_redist_regions(struct vm_gic *v) +{ + uint64_t addr, expected_addr; + int ret; + + ret = __kvm_has_device_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST); + TEST_ASSERT(!ret, "Multiple redist regions advertised"); + + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "attempt to register the first rdist region with index != 0"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "register an rdist region overlapping with another one"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register redist region with 
index not +1"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(1, max_phys_size, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register redist region with base address beyond IPA range"); + + /* The last redist is above the pa range. */ + addr = REDIST_REGION_ATTR_ADDR(2, max_phys_size - 0x30000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register redist region with top address beyond IPA range"); + + addr = 0x260000; + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); + + /* + * Now there are 2 redist regions: + * region 0 @ 0x200000 2 redists + * region 1 @ 0x240000 1 redist + * Attempt to read their characteristics + */ + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); + expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); + expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); + + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); + ret = __kvm_device_attr_get(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); + + addr = 0x260000; + kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); + ret = __kvm_device_attr_set(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); +} + +/* + * VGIC KVM device is created and initialized before the secondary CPUs + * get created + */ +static void test_vgic_then_vcpus(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret, i; + + v = vm_gic_create_with_vcpus(gic_dev_type, 1, vcpus); + + subtest_dist_rdist(&v); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +/* All the VCPUs are created before the VGIC KVM device gets initialized */ +static void test_vcpus_then_vgic(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret; + + v = vm_gic_create_with_vcpus(gic_dev_type, NR_VCPUS, vcpus); + + subtest_dist_rdist(&v); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +#define KVM_VGIC_V2_ATTR(offset, cpu) \ + (FIELD_PREP(KVM_DEV_ARM_VGIC_OFFSET_MASK, offset) | \ + FIELD_PREP(KVM_DEV_ARM_VGIC_CPUID_MASK, cpu)) 
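+
+/*
+ * Illustrative sketch only (not referenced by the tests): a minimal, valid
+ * GICv3 setup on a VM created with vm_create_with_vcpus() looks roughly like:
+ *
+ *	int gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3);
+ *	uint64_t addr = 0x0;
+ *
+ *	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+ *			    KVM_VGIC_V3_ADDR_TYPE_DIST, &addr);
+ *	addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0);
+ *	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR,
+ *			    KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr);
+ *	kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL,
+ *			    KVM_DEV_ARM_VGIC_CTRL_INIT, NULL);
+ *
+ * The tests in this file perturb individual steps of this sequence and check
+ * for the expected errno or vcpu-run failure.
+ */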
+ +#define GIC_CPU_CTRL 0x00 + +static void test_v2_uaccess_cpuif_no_vcpus(void) +{ + struct vm_gic v; + u64 val = 0; + int ret; + + v = vm_gic_create_barebones(KVM_DEV_TYPE_ARM_VGIC_V2); + subtest_dist_rdist(&v); + + ret = __kvm_has_device_attr(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0)); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_get(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CPU_REGS, + KVM_VGIC_V2_ATTR(GIC_CPU_CTRL, 0), &val); + TEST_ASSERT(ret && errno == EINVAL, + "accessed non-existent CPU interface, want errno: %i", + EINVAL); + + vm_gic_destroy(&v); +} + +static void test_v3_new_redist_regions(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + void *dummy = NULL; + struct vm_gic v; + uint64_t addr; + int ret; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); + vm_gic_destroy(&v); + + /* step2 */ + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); + + vm_gic_destroy(&v); + + /* step 3 */ + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + subtest_v3_redist_regions(&v); + + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy); + TEST_ASSERT(ret && errno == EFAULT, + "register a third region allowing to cover the 4 vcpus"); + + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + ret = run_vcpu(vcpus[3]); + TEST_ASSERT(!ret, "vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_v3_typer_accesses(void) +{ + struct vm_gic v; + uint64_t addr; + int ret, i; + + v.vm = vm_create(NR_VCPUS); + (void)vm_vcpu_add(v.vm, 0, guest_code); + + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + + (void)vm_vcpu_add(v.vm, 3, guest_code); + + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EINVAL, + "attempting to read GICR_TYPER of non created vcpu"); + + (void)vm_vcpu_add(v.vm, 1, guest_code); + + v3_redist_reg_get_errno(v.gic_fd, 1, GICR_TYPER, EBUSY, + "read GICR_TYPER before GIC initialized"); + + (void)vm_vcpu_add(v.vm, 2, guest_code); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + for (i = 0; i < NR_VCPUS ; i++) { + v3_redist_reg_get(v.gic_fd, i, GICR_TYPER, i * 0x100, + "read GICR_TYPER before rdist region setting"); + } + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + /* The 2 first rdists should be put there (vcpu 0 and 3) */ + 
v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x0, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #1"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x100000, 0, 1); + ret = __kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); + + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, + "no redist region attached to vcpu #1 yet, last cannot be returned"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, + "no redist region attached to vcpu #2, last cannot be returned"); + + addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, + "read typer of rdist #1, last properly returned"); + + vm_gic_destroy(&v); +} + +static struct vm_gic vm_gic_v3_create_with_vcpuids(int nr_vcpus, + uint32_t vcpuids[]) +{ + struct vm_gic v; + int i; + + v.vm = vm_create(nr_vcpus); + for (i = 0; i < nr_vcpus; i++) + vm_vcpu_add(v.vm, vcpuids[i], guest_code); + + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3); + + return v; +} + +/** + * Test GICR_TYPER last bit with new redist regions + * rdist regions #1 and #2 are contiguous + * rdist region #0 @0x100000 2 rdist capacity + * rdists: 0, 3 (Last) + * rdist region #1 @0x240000 2 rdist capacity + * rdists: 5, 4 (Last) + * rdist region #2 @0x200000 2 rdist capacity + * rdists: 1, 2 + */ +static void test_v3_last_bit_redist_regions(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + struct vm_gic v; + uint64_t addr; + + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr); + + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #1"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x200, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x310, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #5"); + v3_redist_reg_get(v.gic_fd, 4, GICR_TYPER, 0x410, "read typer of rdist #4"); + + vm_gic_destroy(&v); +} + +/* Test last bit with legacy region */ +static void test_v3_last_bit_single_rdist(void) +{ + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; + struct vm_gic v; + uint64_t addr; + + v = vm_gic_v3_create_with_vcpuids(ARRAY_SIZE(vcpuids), vcpuids); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + addr = 0x10000; + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + + v3_redist_reg_get(v.gic_fd, 0, GICR_TYPER, 0x000, "read typer of rdist #0"); + v3_redist_reg_get(v.gic_fd, 3, GICR_TYPER, 0x300, "read typer of rdist 
#1"); + v3_redist_reg_get(v.gic_fd, 5, GICR_TYPER, 0x500, "read typer of rdist #2"); + v3_redist_reg_get(v.gic_fd, 1, GICR_TYPER, 0x100, "read typer of rdist #3"); + v3_redist_reg_get(v.gic_fd, 2, GICR_TYPER, 0x210, "read typer of rdist #3"); + + vm_gic_destroy(&v); +} + +/* Uses the legacy REDIST region API. */ +static void test_v3_redist_ipa_range_check_at_vcpu_run(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + int ret, i; + uint64_t addr; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, 1, vcpus); + + /* Set space for 3 redists, we have 1 vcpu, so this succeeds. */ + addr = max_phys_size - (3 * 2 * 0x10000); + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr); + + addr = 0x00000; + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr); + + /* Add the rest of the VCPUs */ + for (i = 1; i < NR_VCPUS; ++i) + vcpus[i] = vm_vcpu_add(v.vm, i, guest_code); + + kvm_device_attr_set(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + /* Attempt to run a vcpu without enough redist space. */ + ret = run_vcpu(vcpus[2]); + TEST_ASSERT(ret && errno == EINVAL, + "redist base+size above PA range detected on 1st vcpu run"); + + vm_gic_destroy(&v); +} + +static void test_v3_its_region(void) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + uint64_t addr; + int its_fd, ret; + + v = vm_gic_create_with_vcpus(KVM_DEV_TYPE_ARM_VGIC_V3, NR_VCPUS, vcpus); + its_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + + addr = 0x401000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == EINVAL, + "ITS region with misaligned address"); + + addr = max_phys_size; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "register ITS region with base address beyond IPA range"); + + addr = max_phys_size - 0x10000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == E2BIG, + "Half of ITS region is beyond IPA range"); + + /* This one succeeds setting the ITS base */ + addr = 0x400000; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + + addr = 0x300000; + ret = __kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &addr); + TEST_ASSERT(ret && errno == EEXIST, "ITS base set again"); + + close(its_fd); + vm_gic_destroy(&v); +} + +/* + * Returns 0 if it's possible to create GIC device of a given type (V2 or V3). + */ +int test_kvm_device(uint32_t gic_dev_type) +{ + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct vm_gic v; + uint32_t other; + int ret; + + v.vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); + + /* try to create a non existing KVM device */ + ret = __kvm_test_create_device(v.vm, 0); + TEST_ASSERT(ret && errno == ENODEV, "unsupported device"); + + /* trial mode */ + ret = __kvm_test_create_device(v.vm, gic_dev_type); + if (ret) + return ret; + v.gic_fd = kvm_create_device(v.vm, gic_dev_type); + + ret = __kvm_create_device(v.vm, gic_dev_type); + TEST_ASSERT(ret < 0 && errno == EEXIST, "create GIC device twice"); + + /* try to create the other gic_dev_type */ + other = VGIC_DEV_IS_V2(gic_dev_type) ? 
KVM_DEV_TYPE_ARM_VGIC_V3 + : KVM_DEV_TYPE_ARM_VGIC_V2; + + if (!__kvm_test_create_device(v.vm, other)) { + ret = __kvm_create_device(v.vm, other); + TEST_ASSERT(ret < 0 && (errno == EINVAL || errno == EEXIST), + "create GIC device while other version exists"); + } + + vm_gic_destroy(&v); + + return 0; +} + +void run_tests(uint32_t gic_dev_type) +{ + test_vcpus_then_vgic(gic_dev_type); + test_vgic_then_vcpus(gic_dev_type); + + if (VGIC_DEV_IS_V2(gic_dev_type)) + test_v2_uaccess_cpuif_no_vcpus(); + + if (VGIC_DEV_IS_V3(gic_dev_type)) { + test_v3_new_redist_regions(); + test_v3_typer_accesses(); + test_v3_last_bit_redist_regions(); + test_v3_last_bit_single_rdist(); + test_v3_redist_ipa_range_check_at_vcpu_run(); + test_v3_its_region(); + } +} + +int main(int ac, char **av) +{ + int ret; + int pa_bits; + int cnt_impl = 0; + + pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; + max_phys_size = 1ULL << pa_bits; + + ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (!ret) { + pr_info("Running GIC_v3 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V3); + cnt_impl++; + } + + ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (!ret) { + pr_info("Running GIC_v2 tests.\n"); + run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); + cnt_impl++; + } + + if (!cnt_impl) { + print_skip("No GICv2 nor GICv3 support"); + exit(KSFT_SKIP); + } + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c new file mode 100644 index 000000000000..a51dbd2a5f84 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -0,0 +1,848 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_irq.c - Test userspace injection of IRQs + * + * This test validates the injection of IRQs from userspace using various + * methods (e.g., KVM_IRQ_LINE) and modes (e.g., EOI). The guest "asks" the + * host to inject a specific intid via a GUEST_SYNC call, and then checks that + * it received it. + */ +#include <asm/kvm.h> +#include <asm/kvm_para.h> +#include <sys/eventfd.h> +#include <linux/sizes.h> + +#include "processor.h" +#include "test_util.h" +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "vgic.h" + +/* + * Stores the user specified args; it's passed to the guest and to every test + * function. + */ +struct test_args { + uint32_t nr_irqs; /* number of KVM supported IRQs. */ + bool eoi_split; /* 1 is eoir+dir, 0 is eoir only */ + bool level_sensitive; /* 1 is level, 0 is edge */ + int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ + bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ +}; + +/* + * KVM implements 32 priority levels: + * 0x00 (highest priority) - 0xF8 (lowest priority), in steps of 8 + * + * Note that these macros will still be correct in the case that KVM implements + * more priority levels. Also note that 32 is the minimum for GICv3 and GICv2. + */ +#define KVM_NUM_PRIOS 32 +#define KVM_PRIO_SHIFT 3 /* steps of 8 = 1 << 3 */ +#define KVM_PRIO_STEPS (1 << KVM_PRIO_SHIFT) /* 8 */ +#define LOWEST_PRIO (KVM_NUM_PRIOS - 1) +#define CPU_PRIO_MASK (LOWEST_PRIO << KVM_PRIO_SHIFT) /* 0xf8 */ +#define IRQ_DEFAULT_PRIO (LOWEST_PRIO - 1) +#define IRQ_DEFAULT_PRIO_REG (IRQ_DEFAULT_PRIO << KVM_PRIO_SHIFT) /* 0xf0 */ + +/* + * The kvm_inject_* utilities are used by the guest to ask the host to inject + * interrupts (e.g., using the KVM_IRQ_LINE ioctl). 
+ */
+
+typedef enum {
+ KVM_INJECT_EDGE_IRQ_LINE = 1,
+ KVM_SET_IRQ_LINE,
+ KVM_SET_IRQ_LINE_HIGH,
+ KVM_SET_LEVEL_INFO_HIGH,
+ KVM_INJECT_IRQFD,
+ KVM_WRITE_ISPENDR,
+ KVM_WRITE_ISACTIVER,
+} kvm_inject_cmd;
+
+struct kvm_inject_args {
+ kvm_inject_cmd cmd;
+ uint32_t first_intid;
+ uint32_t num;
+ int level;
+ bool expect_failure;
+};
+
+/* Used on the guest side to perform the hypercall. */
+static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid,
+ uint32_t num, int level, bool expect_failure);
+
+/* Used on the host side to get the hypercall info. */
+static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc,
+ struct kvm_inject_args *args);
+
+#define _KVM_INJECT_MULTI(cmd, intid, num, expect_failure) \
+ kvm_inject_call(cmd, intid, num, -1 /* not used */, expect_failure)
+
+#define KVM_INJECT_MULTI(cmd, intid, num) \
+ _KVM_INJECT_MULTI(cmd, intid, num, false)
+
+#define _KVM_INJECT(cmd, intid, expect_failure) \
+ _KVM_INJECT_MULTI(cmd, intid, 1, expect_failure)
+
+#define KVM_INJECT(cmd, intid) \
+ _KVM_INJECT_MULTI(cmd, intid, 1, false)
+
+#define KVM_ACTIVATE(cmd, intid) \
+ kvm_inject_call(cmd, intid, 1, 1, false);
+
+struct kvm_inject_desc {
+ kvm_inject_cmd cmd;
+ /* can inject SGIs, PPIs, and/or SPIs. */
+ bool sgi, ppi, spi;
+};
+
+static struct kvm_inject_desc inject_edge_fns[] = {
+ /* sgi ppi spi */
+ { KVM_INJECT_EDGE_IRQ_LINE, false, false, true },
+ { KVM_INJECT_IRQFD, false, false, true },
+ { KVM_WRITE_ISPENDR, true, false, true },
+ { 0, },
+};
+
+static struct kvm_inject_desc inject_level_fns[] = {
+ /* sgi ppi spi */
+ { KVM_SET_IRQ_LINE_HIGH, false, true, true },
+ { KVM_SET_LEVEL_INFO_HIGH, false, true, true },
+ { KVM_INJECT_IRQFD, false, false, true },
+ { KVM_WRITE_ISPENDR, false, true, true },
+ { 0, },
+};
+
+static struct kvm_inject_desc set_active_fns[] = {
+ /* sgi ppi spi */
+ { KVM_WRITE_ISACTIVER, true, true, true },
+ { 0, },
+};
+
+#define for_each_inject_fn(t, f) \
+ for ((f) = (t); (f)->cmd; (f)++)
+
+#define for_each_supported_inject_fn(args, t, f) \
+ for_each_inject_fn(t, f) \
+ if ((args)->kvm_supports_irqfd || (f)->cmd != KVM_INJECT_IRQFD)
+
+#define for_each_supported_activate_fn(args, t, f) \
+ for_each_supported_inject_fn((args), (t), (f))
+
+/* Shared between the guest main thread and the IRQ handlers. 
*/ +volatile uint64_t irq_handled; +volatile uint32_t irqnr_received[MAX_SPI + 1]; + +static void reset_stats(void) +{ + int i; + + irq_handled = 0; + for (i = 0; i <= MAX_SPI; i++) + irqnr_received[i] = 0; +} + +static uint64_t gic_read_ap1r0(void) +{ + uint64_t reg = read_sysreg_s(SYS_ICC_AP1R0_EL1); + + dsb(sy); + return reg; +} + +static void gic_write_ap1r0(uint64_t val) +{ + write_sysreg_s(val, SYS_ICC_AP1R0_EL1); + isb(); +} + +static void guest_set_irq_line(uint32_t intid, uint32_t level); + +static void guest_irq_generic_handler(bool eoi_split, bool level_sensitive) +{ + uint32_t intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(gic_irq_get_active(intid)); + + if (!level_sensitive) + GUEST_ASSERT(!gic_irq_get_pending(intid)); + + if (level_sensitive) + guest_set_irq_line(intid, 0); + + GUEST_ASSERT(intid < MAX_SPI); + irqnr_received[intid] += 1; + irq_handled += 1; + + gic_set_eoi(intid); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + if (eoi_split) + gic_set_dir(intid); + + GUEST_ASSERT(!gic_irq_get_active(intid)); + GUEST_ASSERT(!gic_irq_get_pending(intid)); +} + +static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, + uint32_t num, int level, bool expect_failure) +{ + struct kvm_inject_args args = { + .cmd = cmd, + .first_intid = first_intid, + .num = num, + .level = level, + .expect_failure = expect_failure, + }; + GUEST_SYNC(&args); +} + +#define GUEST_ASSERT_IAR_EMPTY() \ +do { \ + uint32_t _intid; \ + _intid = gic_get_and_ack_irq(); \ + GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ +} while (0) + +#define CAT_HELPER(a, b) a ## b +#define CAT(a, b) CAT_HELPER(a, b) +#define PREFIX guest_irq_handler_ +#define GUEST_IRQ_HANDLER_NAME(split, lev) CAT(PREFIX, CAT(split, lev)) +#define GENERATE_GUEST_IRQ_HANDLER(split, lev) \ +static void CAT(PREFIX, CAT(split, lev))(struct ex_regs *regs) \ +{ \ + guest_irq_generic_handler(split, lev); \ +} + +GENERATE_GUEST_IRQ_HANDLER(0, 0); +GENERATE_GUEST_IRQ_HANDLER(0, 1); +GENERATE_GUEST_IRQ_HANDLER(1, 0); +GENERATE_GUEST_IRQ_HANDLER(1, 1); + +static void (*guest_irq_handlers[2][2])(struct ex_regs *) = { + {GUEST_IRQ_HANDLER_NAME(0, 0), GUEST_IRQ_HANDLER_NAME(0, 1),}, + {GUEST_IRQ_HANDLER_NAME(1, 0), GUEST_IRQ_HANDLER_NAME(1, 1),}, +}; + +static void reset_priorities(struct test_args *args) +{ + int i; + + for (i = 0; i < args->nr_irqs; i++) + gic_set_priority(i, IRQ_DEFAULT_PRIO_REG); +} + +static void guest_set_irq_line(uint32_t intid, uint32_t level) +{ + kvm_inject_call(KVM_SET_IRQ_LINE, intid, 1, level, false); +} + +static void test_inject_fail(struct test_args *args, + uint32_t intid, kvm_inject_cmd cmd) +{ + reset_stats(); + + _KVM_INJECT(cmd, intid, true); + /* no IRQ to handle on entry */ + + GUEST_ASSERT_EQ(irq_handled, 0); + GUEST_ASSERT_IAR_EMPTY(); +} + +static void guest_inject(struct test_args *args, + uint32_t first_intid, uint32_t num, + kvm_inject_cmd cmd) +{ + uint32_t i; + + reset_stats(); + + /* Cycle over all priorities to make things more interesting. 
*/ + for (i = first_intid; i < num + first_intid; i++) + gic_set_priority(i, (i % (KVM_NUM_PRIOS - 1)) << 3); + + asm volatile("msr daifset, #2" : : : "memory"); + KVM_INJECT_MULTI(cmd, first_intid, num); + + while (irq_handled < num) { + asm volatile("wfi\n" + "msr daifclr, #2\n" + /* handle IRQ */ + "msr daifset, #2\n" + : : : "memory"); + } + asm volatile("msr daifclr, #2" : : : "memory"); + + GUEST_ASSERT_EQ(irq_handled, num); + for (i = first_intid; i < num + first_intid; i++) + GUEST_ASSERT_EQ(irqnr_received[i], 1); + GUEST_ASSERT_IAR_EMPTY(); + + reset_priorities(args); +} + +/* + * Restore the active state of multiple concurrent IRQs (given by + * concurrent_irqs). This does what a live-migration would do on the + * destination side assuming there are some active IRQs that were not + * deactivated yet. + */ +static void guest_restore_active(struct test_args *args, + uint32_t first_intid, uint32_t num, + kvm_inject_cmd cmd) +{ + uint32_t prio, intid, ap1r; + int i; + + /* + * Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs + * in descending order, so intid+1 can preempt intid. + */ + for (i = 0, prio = (num - 1) * 8; i < num; i++, prio -= 8) { + GUEST_ASSERT(prio >= 0); + intid = i + first_intid; + gic_set_priority(intid, prio); + } + + /* + * In a real migration, KVM would restore all GIC state before running + * guest code. + */ + for (i = 0; i < num; i++) { + intid = i + first_intid; + KVM_ACTIVATE(cmd, intid); + ap1r = gic_read_ap1r0(); + ap1r |= 1U << i; + gic_write_ap1r0(ap1r); + } + + /* This is where the "migration" would occur. */ + + /* finish handling the IRQs starting with the highest priority one. */ + for (i = 0; i < num; i++) { + intid = num - i - 1 + first_intid; + gic_set_eoi(intid); + if (args->eoi_split) + gic_set_dir(intid); + } + + for (i = 0; i < num; i++) + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + GUEST_ASSERT_IAR_EMPTY(); +} + +/* + * Polls the IAR until it's not a spurious interrupt. + * + * This function should only be used in test_inject_preemption (with IRQs + * masked). + */ +static uint32_t wait_for_and_activate_irq(void) +{ + uint32_t intid; + + do { + asm volatile("wfi" : : : "memory"); + intid = gic_get_and_ack_irq(); + } while (intid == IAR_SPURIOUS); + + return intid; +} + +/* + * Inject multiple concurrent IRQs (num IRQs starting at first_intid) and + * handle them without handling the actual exceptions. This is done by masking + * interrupts for the whole test. + */ +static void test_inject_preemption(struct test_args *args, + uint32_t first_intid, int num, + kvm_inject_cmd cmd) +{ + uint32_t intid, prio, step = KVM_PRIO_STEPS; + int i; + + /* Set the priorities of the first (KVM_NUM_PRIOS - 1) IRQs + * in descending order, so intid+1 can preempt intid. + */ + for (i = 0, prio = (num - 1) * step; i < num; i++, prio -= step) { + GUEST_ASSERT(prio >= 0); + intid = i + first_intid; + gic_set_priority(intid, prio); + } + + local_irq_disable(); + + for (i = 0; i < num; i++) { + uint32_t tmp; + intid = i + first_intid; + KVM_INJECT(cmd, intid); + /* Each successive IRQ will preempt the previous one. */ + tmp = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(tmp, intid); + if (args->level_sensitive) + guest_set_irq_line(intid, 0); + } + + /* finish handling the IRQs starting with the highest priority one. 
*/ + for (i = 0; i < num; i++) { + intid = num - i - 1 + first_intid; + gic_set_eoi(intid); + if (args->eoi_split) + gic_set_dir(intid); + } + + local_irq_enable(); + + for (i = 0; i < num; i++) + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); + GUEST_ASSERT_IAR_EMPTY(); + + reset_priorities(args); +} + +static void test_injection(struct test_args *args, struct kvm_inject_desc *f) +{ + uint32_t nr_irqs = args->nr_irqs; + + if (f->sgi) { + guest_inject(args, MIN_SGI, 1, f->cmd); + guest_inject(args, 0, 16, f->cmd); + } + + if (f->ppi) + guest_inject(args, MIN_PPI, 1, f->cmd); + + if (f->spi) { + guest_inject(args, MIN_SPI, 1, f->cmd); + guest_inject(args, nr_irqs - 1, 1, f->cmd); + guest_inject(args, MIN_SPI, nr_irqs - MIN_SPI, f->cmd); + } +} + +static void test_injection_failure(struct test_args *args, + struct kvm_inject_desc *f) +{ + uint32_t bad_intid[] = { args->nr_irqs, 1020, 1024, 1120, 5120, ~0U, }; + int i; + + for (i = 0; i < ARRAY_SIZE(bad_intid); i++) + test_inject_fail(args, bad_intid[i], f->cmd); +} + +static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) +{ + /* + * Test up to 4 levels of preemption. The reason is that KVM doesn't + * currently implement the ability to have more than the number-of-LRs + * number of concurrently active IRQs. The number of LRs implemented is + * IMPLEMENTATION DEFINED, however, it seems that most implement 4. + */ + if (f->sgi) + test_inject_preemption(args, MIN_SGI, 4, f->cmd); + + if (f->ppi) + test_inject_preemption(args, MIN_PPI, 4, f->cmd); + + if (f->spi) + test_inject_preemption(args, MIN_SPI, 4, f->cmd); +} + +static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) +{ + /* Test up to 4 active IRQs. Same reason as in test_preemption. */ + if (f->sgi) + guest_restore_active(args, MIN_SGI, 4, f->cmd); + + if (f->ppi) + guest_restore_active(args, MIN_PPI, 4, f->cmd); + + if (f->spi) + guest_restore_active(args, MIN_SPI, 4, f->cmd); +} + +static void guest_code(struct test_args *args) +{ + uint32_t i, nr_irqs = args->nr_irqs; + bool level_sensitive = args->level_sensitive; + struct kvm_inject_desc *f, *inject_fns; + + gic_init(GIC_V3, 1); + + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + + for (i = MIN_SPI; i < nr_irqs; i++) + gic_irq_set_config(i, !level_sensitive); + + gic_set_eoi_split(args->eoi_split); + + reset_priorities(args); + gic_set_priority_mask(CPU_PRIO_MASK); + + inject_fns = level_sensitive ? inject_level_fns + : inject_edge_fns; + + local_irq_enable(); + + /* Start the tests. */ + for_each_supported_inject_fn(args, inject_fns, f) { + test_injection(args, f); + test_preemption(args, f); + test_injection_failure(args, f); + } + + /* + * Restore the active state of IRQs. This would happen when live + * migrating IRQs in the middle of being handled. + */ + for_each_supported_activate_fn(args, set_active_fns, f) + test_restore_active(args, f); + + GUEST_DONE(); +} + +static void kvm_irq_line_check(struct kvm_vm *vm, uint32_t intid, int level, + struct test_args *test_args, bool expect_failure) +{ + int ret; + + if (!expect_failure) { + kvm_arm_irq_line(vm, intid, level); + } else { + /* The interface doesn't allow larger intid's. 
*/ + if (intid > KVM_ARM_IRQ_NUM_MASK) + return; + + ret = _kvm_arm_irq_line(vm, intid, level); + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %i did not cause KVM_IRQ_LINE " + "error: rc: %i errno: %i", intid, ret, errno); + } +} + +void kvm_irq_set_level_info_check(int gic_fd, uint32_t intid, int level, + bool expect_failure) +{ + if (!expect_failure) { + kvm_irq_set_level_info(gic_fd, intid, level); + } else { + int ret = _kvm_irq_set_level_info(gic_fd, intid, level); + /* + * The kernel silently fails for invalid SPIs and SGIs (which + * are not level-sensitive). It only checks for intid to not + * spill over 1U << 10 (the max reserved SPI). Also, callers + * are supposed to mask the intid with 0x3ff (1023). + */ + if (intid > VGIC_MAX_RESERVED) + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %i did not cause VGIC_GRP_LEVEL_INFO " + "error: rc: %i errno: %i", intid, ret, errno); + else + TEST_ASSERT(!ret, "KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO " + "for intid %i failed, rc: %i errno: %i", + intid, ret, errno); + } +} + +static void kvm_set_gsi_routing_irqchip_check(struct kvm_vm *vm, + uint32_t intid, uint32_t num, uint32_t kvm_max_routes, + bool expect_failure) +{ + struct kvm_irq_routing *routing; + int ret; + uint64_t i; + + assert(num <= kvm_max_routes && kvm_max_routes <= KVM_MAX_IRQ_ROUTES); + + routing = kvm_gsi_routing_create(); + for (i = intid; i < (uint64_t)intid + num; i++) + kvm_gsi_routing_irqchip_add(routing, i - MIN_SPI, i - MIN_SPI); + + if (!expect_failure) { + kvm_gsi_routing_write(vm, routing); + } else { + ret = _kvm_gsi_routing_write(vm, routing); + /* The kernel only checks e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS */ + if (((uint64_t)intid + num - 1 - MIN_SPI) >= KVM_IRQCHIP_NUM_PINS) + TEST_ASSERT(ret != 0 && errno == EINVAL, + "Bad intid %u did not cause KVM_SET_GSI_ROUTING " + "error: rc: %i errno: %i", intid, ret, errno); + else + TEST_ASSERT(ret == 0, "KVM_SET_GSI_ROUTING " + "for intid %i failed, rc: %i errno: %i", + intid, ret, errno); + } +} + +static void kvm_irq_write_ispendr_check(int gic_fd, uint32_t intid, + struct kvm_vcpu *vcpu, + bool expect_failure) +{ + /* + * Ignore this when expecting failure as invalid intids will lead to + * either trying to inject SGIs when we configured the test to be + * level_sensitive (or the reverse), or inject large intids which + * will lead to writing above the ISPENDR register space (and we + * don't want to do that either). + */ + if (!expect_failure) + kvm_irq_write_ispendr(gic_fd, intid, vcpu); +} + +static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, + uint32_t intid, uint32_t num, uint32_t kvm_max_routes, + bool expect_failure) +{ + int fd[MAX_SPI]; + uint64_t val; + int ret, f; + uint64_t i; + + /* + * There is no way to try injecting an SGI or PPI as the interface + * starts counting from the first SPI (above the private ones), so just + * exit. + */ + if (INTID_IS_SGI(intid) || INTID_IS_PPI(intid)) + return; + + kvm_set_gsi_routing_irqchip_check(vm, intid, num, + kvm_max_routes, expect_failure); + + /* + * If expect_failure, then just to inject anyway. These + * will silently fail. And in any case, the guest will check + * that no actual interrupt was injected for those cases. 
+ */ + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + fd[f] = eventfd(0, 0); + TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f])); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + struct kvm_irqfd irqfd = { + .fd = fd[f], + .gsi = i - MIN_SPI, + }; + assert(i <= (uint64_t)UINT_MAX); + vm_ioctl(vm, KVM_IRQFD, &irqfd); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) { + val = 1; + ret = write(fd[f], &val, sizeof(uint64_t)); + TEST_ASSERT(ret == sizeof(uint64_t), + __KVM_SYSCALL_ERROR("write()", ret)); + } + + for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) + close(fd[f]); +} + +/* handles the valid case: intid=0xffffffff num=1 */ +#define for_each_intid(first, num, tmp, i) \ + for ((tmp) = (i) = (first); \ + (tmp) < (uint64_t)(first) + (uint64_t)(num); \ + (tmp)++, (i)++) + +static void run_guest_cmd(struct kvm_vcpu *vcpu, int gic_fd, + struct kvm_inject_args *inject_args, + struct test_args *test_args) +{ + kvm_inject_cmd cmd = inject_args->cmd; + uint32_t intid = inject_args->first_intid; + uint32_t num = inject_args->num; + int level = inject_args->level; + bool expect_failure = inject_args->expect_failure; + struct kvm_vm *vm = vcpu->vm; + uint64_t tmp; + uint32_t i; + + /* handles the valid case: intid=0xffffffff num=1 */ + assert(intid < UINT_MAX - num || num == 1); + + switch (cmd) { + case KVM_INJECT_EDGE_IRQ_LINE: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 1, test_args, + expect_failure); + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 0, test_args, + expect_failure); + break; + case KVM_SET_IRQ_LINE: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, level, test_args, + expect_failure); + break; + case KVM_SET_IRQ_LINE_HIGH: + for_each_intid(intid, num, tmp, i) + kvm_irq_line_check(vm, i, 1, test_args, + expect_failure); + break; + case KVM_SET_LEVEL_INFO_HIGH: + for_each_intid(intid, num, tmp, i) + kvm_irq_set_level_info_check(gic_fd, i, 1, + expect_failure); + break; + case KVM_INJECT_IRQFD: + kvm_routing_and_irqfd_check(vm, intid, num, + test_args->kvm_max_routes, + expect_failure); + break; + case KVM_WRITE_ISPENDR: + for (i = intid; i < intid + num; i++) + kvm_irq_write_ispendr_check(gic_fd, i, vcpu, + expect_failure); + break; + case KVM_WRITE_ISACTIVER: + for (i = intid; i < intid + num; i++) + kvm_irq_write_isactiver(gic_fd, i, vcpu); + break; + default: + break; + } +} + +static void kvm_inject_get_call(struct kvm_vm *vm, struct ucall *uc, + struct kvm_inject_args *args) +{ + struct kvm_inject_args *kvm_args_hva; + vm_vaddr_t kvm_args_gva; + + kvm_args_gva = uc->args[1]; + kvm_args_hva = (struct kvm_inject_args *)addr_gva2hva(vm, kvm_args_gva); + memcpy(args, kvm_args_hva, sizeof(struct kvm_inject_args)); +} + +static void print_args(struct test_args *args) +{ + printf("nr-irqs=%d level-sensitive=%d eoi-split=%d\n", + args->nr_irqs, args->level_sensitive, + args->eoi_split); +} + +static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) +{ + struct ucall uc; + int gic_fd; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_inject_args inject_args; + vm_vaddr_t args_gva; + + struct test_args args = { + .nr_irqs = nr_irqs, + .level_sensitive = level_sensitive, + .eoi_split = eoi_split, + .kvm_max_routes = kvm_check_cap(KVM_CAP_IRQ_ROUTING), + .kvm_supports_irqfd = kvm_check_cap(KVM_CAP_IRQFD), + }; + + print_args(&args); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_descriptor_tables(vm); 
+ vcpu_init_descriptor_tables(vcpu); + + /* Setup the guest args page (so it gets the args). */ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpu, 1, args_gva); + + gic_fd = vgic_v3_setup(vm, 1, nr_irqs); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3, skipping"); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, + guest_irq_handlers[args.eoi_split][args.level_sensitive]); + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + kvm_inject_get_call(vm, &uc, &inject_args); + run_guest_cmd(vcpu, gic_fd, &inject_args, &args); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + close(gic_fd); + kvm_vm_free(vm); +} + +static void help(const char *name) +{ + printf( + "\n" + "usage: %s [-n num_irqs] [-e eoi_split] [-l level_sensitive]\n", name); + printf(" -n: specify number of IRQs to setup the vgic with. " + "It has to be a multiple of 32 and between 64 and 1024.\n"); + printf(" -e: if 1 then EOI is split into a write to DIR on top " + "of writing EOI.\n"); + printf(" -l: specify whether the IRQs are level-sensitive (1) or not (0)."); + puts(""); + exit(1); +} + +int main(int argc, char **argv) +{ + uint32_t nr_irqs = 64; + bool default_args = true; + bool level_sensitive = false; + int opt; + bool eoi_split = false; + + while ((opt = getopt(argc, argv, "hn:e:l:")) != -1) { + switch (opt) { + case 'n': + nr_irqs = atoi_non_negative("Number of IRQs", optarg); + if (nr_irqs > 1024 || nr_irqs % 32) + help(argv[0]); + break; + case 'e': + eoi_split = (bool)atoi_paranoid(optarg); + default_args = false; + break; + case 'l': + level_sensitive = (bool)atoi_paranoid(optarg); + default_args = false; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + /* + * If the user just specified nr_irqs and/or gic_version, then run all + * combinations. 
+ */ + if (default_args) { + test_vgic(nr_irqs, false /* level */, false /* eoi_split */); + test_vgic(nr_irqs, false /* level */, true /* eoi_split */); + test_vgic(nr_irqs, true /* level */, false /* eoi_split */); + test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + } else { + test_vgic(nr_irqs, level_sensitive, eoi_split); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c new file mode 100644 index 000000000000..fc4fe52fb6f8 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/vgic_lpi_stress.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vgic_lpi_stress - Stress test for KVM's ITS emulation + * + * Copyright (c) 2024 Google LLC + */ + +#include <linux/sizes.h> +#include <pthread.h> +#include <stdatomic.h> +#include <sys/sysinfo.h> + +#include "kvm_util.h" +#include "gic.h" +#include "gic_v3.h" +#include "gic_v3_its.h" +#include "processor.h" +#include "ucall.h" +#include "vgic.h" + +#define TEST_MEMSLOT_INDEX 1 + +#define GIC_LPI_OFFSET 8192 + +static size_t nr_iterations = 1000; +static vm_paddr_t gpa_base; + +static struct kvm_vm *vm; +static struct kvm_vcpu **vcpus; +static int gic_fd, its_fd; + +static struct test_data { + bool request_vcpus_stop; + u32 nr_cpus; + u32 nr_devices; + u32 nr_event_ids; + + vm_paddr_t device_table; + vm_paddr_t collection_table; + vm_paddr_t cmdq_base; + void *cmdq_base_va; + vm_paddr_t itt_tables; + + vm_paddr_t lpi_prop_table; + vm_paddr_t lpi_pend_tables; +} test_data = { + .nr_cpus = 1, + .nr_devices = 1, + .nr_event_ids = 16, +}; + +static void guest_irq_handler(struct ex_regs *regs) +{ + u32 intid = gic_get_and_ack_irq(); + + if (intid == IAR_SPURIOUS) + return; + + GUEST_ASSERT(intid >= GIC_LPI_OFFSET); + gic_set_eoi(intid); +} + +static void guest_setup_its_mappings(void) +{ + u32 coll_id, device_id, event_id, intid = GIC_LPI_OFFSET; + u32 nr_events = test_data.nr_event_ids; + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + + for (coll_id = 0; coll_id < nr_cpus; coll_id++) + its_send_mapc_cmd(test_data.cmdq_base_va, coll_id, coll_id, true); + + /* Round-robin the LPIs to all of the vCPUs in the VM */ + coll_id = 0; + for (device_id = 0; device_id < nr_devices; device_id++) { + vm_paddr_t itt_base = test_data.itt_tables + (device_id * SZ_64K); + + its_send_mapd_cmd(test_data.cmdq_base_va, device_id, + itt_base, SZ_64K, true); + + for (event_id = 0; event_id < nr_events; event_id++) { + its_send_mapti_cmd(test_data.cmdq_base_va, device_id, + event_id, coll_id, intid++); + + coll_id = (coll_id + 1) % test_data.nr_cpus; + } + } +} + +static void guest_invalidate_all_rdists(void) +{ + int i; + + for (i = 0; i < test_data.nr_cpus; i++) + its_send_invall_cmd(test_data.cmdq_base_va, i); +} + +static void guest_setup_gic(void) +{ + static atomic_int nr_cpus_ready = 0; + u32 cpuid = guest_get_vcpuid(); + + gic_init(GIC_V3, test_data.nr_cpus); + gic_rdist_enable_lpis(test_data.lpi_prop_table, SZ_64K, + test_data.lpi_pend_tables + (cpuid * SZ_64K)); + + atomic_fetch_add(&nr_cpus_ready, 1); + + if (cpuid > 0) + return; + + while (atomic_load(&nr_cpus_ready) < test_data.nr_cpus) + cpu_relax(); + + its_init(test_data.collection_table, SZ_64K, + test_data.device_table, SZ_64K, + test_data.cmdq_base, SZ_64K); + + guest_setup_its_mappings(); + guest_invalidate_all_rdists(); +} + +static void guest_code(size_t nr_lpis) +{ + guest_setup_gic(); + + GUEST_SYNC(0); + + /* + * Don't use WFI here to avoid blocking 
the vCPU thread indefinitely and + * never getting the stop signal. + */ + while (!READ_ONCE(test_data.request_vcpus_stop)) + cpu_relax(); + + GUEST_DONE(); +} + +static void setup_memslot(void) +{ + size_t pages; + size_t sz; + + /* + * For the ITS: + * - A single level device table + * - A single level collection table + * - The command queue + * - An ITT for each device + */ + sz = (3 + test_data.nr_devices) * SZ_64K; + + /* + * For the redistributors: + * - A shared LPI configuration table + * - An LPI pending table for each vCPU + */ + sz += (1 + test_data.nr_cpus) * SZ_64K; + + pages = sz / vm->page_size; + gpa_base = ((vm_compute_max_gfn(vm) + 1) * vm->page_size) - sz; + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa_base, + TEST_MEMSLOT_INDEX, pages, 0); +} + +#define LPI_PROP_DEFAULT_PRIO 0xa0 + +static void configure_lpis(void) +{ + size_t nr_lpis = test_data.nr_devices * test_data.nr_event_ids; + u8 *tbl = addr_gpa2hva(vm, test_data.lpi_prop_table); + size_t i; + + for (i = 0; i < nr_lpis; i++) { + tbl[i] = LPI_PROP_DEFAULT_PRIO | + LPI_PROP_GROUP1 | + LPI_PROP_ENABLED; + } +} + +static void setup_test_data(void) +{ + size_t pages_per_64k = vm_calc_num_guest_pages(vm->mode, SZ_64K); + u32 nr_devices = test_data.nr_devices; + u32 nr_cpus = test_data.nr_cpus; + vm_paddr_t cmdq_base; + + test_data.device_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + test_data.collection_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, + TEST_MEMSLOT_INDEX); + + cmdq_base = vm_phy_pages_alloc(vm, pages_per_64k, gpa_base, + TEST_MEMSLOT_INDEX); + virt_map(vm, cmdq_base, cmdq_base, pages_per_64k); + test_data.cmdq_base = cmdq_base; + test_data.cmdq_base_va = (void *)cmdq_base; + + test_data.itt_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_devices, + gpa_base, TEST_MEMSLOT_INDEX); + + test_data.lpi_prop_table = vm_phy_pages_alloc(vm, pages_per_64k, + gpa_base, TEST_MEMSLOT_INDEX); + configure_lpis(); + + test_data.lpi_pend_tables = vm_phy_pages_alloc(vm, pages_per_64k * nr_cpus, + gpa_base, TEST_MEMSLOT_INDEX); + + sync_global_to_guest(vm, test_data); +} + +static void setup_gic(void) +{ + gic_fd = vgic_v3_setup(vm, test_data.nr_cpus, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create GICv3"); + + its_fd = vgic_its_setup(vm); +} + +static void signal_lpi(u32 device_id, u32 event_id) +{ + vm_paddr_t db_addr = GITS_BASE_GPA + GITS_TRANSLATER; + + struct kvm_msi msi = { + .address_lo = db_addr, + .address_hi = db_addr >> 32, + .data = event_id, + .devid = device_id, + .flags = KVM_MSI_VALID_DEVID, + }; + + /* + * KVM_SIGNAL_MSI returns 1 if the MSI wasn't 'blocked' by the VM, + * which for arm64 implies having a valid translation in the ITS. 
+ */ + TEST_ASSERT(__vm_ioctl(vm, KVM_SIGNAL_MSI, &msi) == 1, + "KVM_SIGNAL_MSI ioctl failed"); +} + +static pthread_barrier_t test_setup_barrier; + +static void *lpi_worker_thread(void *data) +{ + u32 device_id = (size_t)data; + u32 event_id; + size_t i; + + pthread_barrier_wait(&test_setup_barrier); + + for (i = 0; i < nr_iterations; i++) + for (event_id = 0; event_id < test_data.nr_event_ids; event_id++) + signal_lpi(device_id, event_id); + + return NULL; +} + +static void *vcpu_worker_thread(void *data) +{ + struct kvm_vcpu *vcpu = data; + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pthread_barrier_wait(&test_setup_barrier); + continue; + case UCALL_DONE: + return NULL; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unknown ucall: %lu", uc.cmd); + } + } + + return NULL; +} + +static void report_stats(struct timespec delta) +{ + double nr_lpis; + double time; + + nr_lpis = test_data.nr_devices * test_data.nr_event_ids * nr_iterations; + + time = delta.tv_sec; + time += ((double)delta.tv_nsec) / NSEC_PER_SEC; + + pr_info("Rate: %.2f LPIs/sec\n", nr_lpis / time); +} + +static void run_test(void) +{ + u32 nr_devices = test_data.nr_devices; + u32 nr_vcpus = test_data.nr_cpus; + pthread_t *lpi_threads = malloc(nr_devices * sizeof(pthread_t)); + pthread_t *vcpu_threads = malloc(nr_vcpus * sizeof(pthread_t)); + struct timespec start, delta; + size_t i; + + TEST_ASSERT(lpi_threads && vcpu_threads, "Failed to allocate pthread arrays"); + + pthread_barrier_init(&test_setup_barrier, NULL, nr_vcpus + nr_devices + 1); + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker_thread, vcpus[i]); + + for (i = 0; i < nr_devices; i++) + pthread_create(&lpi_threads[i], NULL, lpi_worker_thread, (void *)i); + + pthread_barrier_wait(&test_setup_barrier); + + clock_gettime(CLOCK_MONOTONIC, &start); + + for (i = 0; i < nr_devices; i++) + pthread_join(lpi_threads[i], NULL); + + delta = timespec_elapsed(start); + write_guest_global(vm, test_data.request_vcpus_stop, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + report_stats(delta); +} + +static void setup_vm(void) +{ + int i; + + vcpus = malloc(test_data.nr_cpus * sizeof(struct kvm_vcpu)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(test_data.nr_cpus, guest_code, vcpus); + + vm_init_descriptor_tables(vm); + for (i = 0; i < test_data.nr_cpus; i++) + vcpu_init_descriptor_tables(vcpus[i]); + + vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handler); + + setup_memslot(); + + setup_gic(); + + setup_test_data(); +} + +static void destroy_vm(void) +{ + close(its_fd); + close(gic_fd); + kvm_vm_free(vm); + free(vcpus); +} + +static void pr_usage(const char *name) +{ + pr_info("%s [-v NR_VCPUS] [-d NR_DEVICES] [-e NR_EVENTS] [-i ITERS] -h\n", name); + pr_info(" -v:\tnumber of vCPUs (default: %u)\n", test_data.nr_cpus); + pr_info(" -d:\tnumber of devices (default: %u)\n", test_data.nr_devices); + pr_info(" -e:\tnumber of event IDs per device (default: %u)\n", test_data.nr_event_ids); + pr_info(" -i:\tnumber of iterations (default: %lu)\n", nr_iterations); +} + +int main(int argc, char **argv) +{ + u32 nr_threads; + int c; + + while ((c = getopt(argc, argv, "hv:d:e:i:")) != -1) { + switch (c) { + case 'v': + test_data.nr_cpus = atoi(optarg); + break; + case 'd': + test_data.nr_devices = atoi(optarg); + break; + case 'e': + test_data.nr_event_ids = 
atoi(optarg);
+ break;
+ case 'i':
+ nr_iterations = strtoul(optarg, NULL, 0);
+ break;
+ case 'h':
+ default:
+ pr_usage(argv[0]);
+ return 1;
+ }
+ }
+
+ nr_threads = test_data.nr_cpus + test_data.nr_devices;
+ if (nr_threads > get_nprocs())
+ pr_info("WARNING: running %u threads on %d CPUs; performance is degraded.\n",
+ nr_threads, get_nprocs());
+
+ setup_vm();
+
+ run_test();
+
+ destroy_vm();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
new file mode 100644
index 000000000000..d31b9f64ba14
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vpmu_counter_access - Test vPMU event counter access
+ *
+ * Copyright (c) 2023 Google LLC.
+ *
+ * This test checks if the guest can see the same number of PMU event
+ * counters (PMCR_EL0.N) that userspace sets, if the guest can access
+ * those counters, and if the guest is prevented from accessing any
+ * other counters.
+ * It also checks if the userspace accesses to the PMU registers honor the
+ * PMCR.N value that's set for the guest.
+ * This test runs only when KVM_CAP_ARM_PMU_V3 is supported on the host.
+ */
+#include <kvm_util.h>
+#include <processor.h>
+#include <test_util.h>
+#include <vgic.h>
+#include <perf/arm_pmuv3.h>
+#include <linux/bitfield.h>
+
+/* The max number of the PMU event counters (excluding the cycle counter) */
+#define ARMV8_PMU_MAX_GENERAL_COUNTERS (ARMV8_PMU_MAX_COUNTERS - 1)
+
+/* The cycle counter bit position that's common among the PMU registers */
+#define ARMV8_PMU_CYCLE_IDX 31
+
+struct vpmu_vm {
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ int gic_fd;
+};
+
+static struct vpmu_vm vpmu_vm;
+
+struct pmreg_sets {
+ uint64_t set_reg_id;
+ uint64_t clr_reg_id;
+};
+
+#define PMREG_SET(set, clr) {.set_reg_id = set, .clr_reg_id = clr}
+
+static uint64_t get_pmcr_n(uint64_t pmcr)
+{
+ return FIELD_GET(ARMV8_PMU_PMCR_N, pmcr);
+}
+
+static void set_pmcr_n(uint64_t *pmcr, uint64_t pmcr_n)
+{
+ u64p_replace_bits((__u64 *) pmcr, pmcr_n, ARMV8_PMU_PMCR_N);
+}
+
+static uint64_t get_counters_mask(uint64_t n)
+{
+ uint64_t mask = BIT(ARMV8_PMU_CYCLE_IDX);
+
+ if (n)
+ mask |= GENMASK(n - 1, 0);
+ return mask;
+}
+
+/* Read PMEVTCNTR<n>_EL0 through PMXEVCNTR_EL0 */
+static inline unsigned long read_sel_evcntr(int sel)
+{
+ write_sysreg(sel, pmselr_el0);
+ isb();
+ return read_sysreg(pmxevcntr_el0);
+}
+
+/* Write PMEVTCNTR<n>_EL0 through PMXEVCNTR_EL0 */
+static inline void write_sel_evcntr(int sel, unsigned long val)
+{
+ write_sysreg(sel, pmselr_el0);
+ isb();
+ write_sysreg(val, pmxevcntr_el0);
+ isb();
+}
+
+/* Read PMEVTYPER<n>_EL0 through PMXEVTYPER_EL0 */
+static inline unsigned long read_sel_evtyper(int sel)
+{
+ write_sysreg(sel, pmselr_el0);
+ isb();
+ return read_sysreg(pmxevtyper_el0);
+}
+
+/* Write PMEVTYPER<n>_EL0 through PMXEVTYPER_EL0 */
+static inline void write_sel_evtyper(int sel, unsigned long val)
+{
+ write_sysreg(sel, pmselr_el0);
+ isb();
+ write_sysreg(val, pmxevtyper_el0);
+ isb();
+}
+
+static void pmu_disable_reset(void)
+{
+ uint64_t pmcr = read_sysreg(pmcr_el0);
+
+ /* Reset all counters, disabling them */
+ pmcr &= ~ARMV8_PMU_PMCR_E;
+ write_sysreg(pmcr | ARMV8_PMU_PMCR_P, pmcr_el0);
+ isb();
+}
+
+#define RETURN_READ_PMEVCNTRN(n) \
+ return read_sysreg(pmevcntr##n##_el0)
+static unsigned long read_pmevcntrn(int n)
+{
+ PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
+ return 0;
+}
+
+#define 
WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0) +static void write_pmevcntrn(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); + isb(); +} + +#define READ_PMEVTYPERN(n) \ + return read_sysreg(pmevtyper##n##_el0) +static unsigned long read_pmevtypern(int n) +{ + PMEVN_SWITCH(n, READ_PMEVTYPERN); + return 0; +} + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0) +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); + isb(); +} + +/* + * The pmc_accessor structure has pointers to PMEV{CNTR,TYPER}<n>_EL0 + * accessors that test cases will use. Each of the accessors will + * either directly reads/writes PMEV{CNTR,TYPER}<n>_EL0 + * (i.e. {read,write}_pmev{cnt,type}rn()), or reads/writes them through + * PMXEV{CNTR,TYPER}_EL0 (i.e. {read,write}_sel_ev{cnt,type}r()). + * + * This is used to test that combinations of those accessors provide + * the consistent behavior. + */ +struct pmc_accessor { + /* A function to be used to read PMEVTCNTR<n>_EL0 */ + unsigned long (*read_cntr)(int idx); + /* A function to be used to write PMEVTCNTR<n>_EL0 */ + void (*write_cntr)(int idx, unsigned long val); + /* A function to be used to read PMEVTYPER<n>_EL0 */ + unsigned long (*read_typer)(int idx); + /* A function to be used to write PMEVTYPER<n>_EL0 */ + void (*write_typer)(int idx, unsigned long val); +}; + +struct pmc_accessor pmc_accessors[] = { + /* test with all direct accesses */ + { read_pmevcntrn, write_pmevcntrn, read_pmevtypern, write_pmevtypern }, + /* test with all indirect accesses */ + { read_sel_evcntr, write_sel_evcntr, read_sel_evtyper, write_sel_evtyper }, + /* read with direct accesses, and write with indirect accesses */ + { read_pmevcntrn, write_sel_evcntr, read_pmevtypern, write_sel_evtyper }, + /* read with indirect accesses, and write with direct accesses */ + { read_sel_evcntr, write_pmevcntrn, read_sel_evtyper, write_pmevtypern }, +}; + +/* + * Convert a pointer of pmc_accessor to an index in pmc_accessors[], + * assuming that the pointer is one of the entries in pmc_accessors[]. + */ +#define PMC_ACC_TO_IDX(acc) (acc - &pmc_accessors[0]) + +#define GUEST_ASSERT_BITMAP_REG(regname, mask, set_expected) \ +{ \ + uint64_t _tval = read_sysreg(regname); \ + \ + if (set_expected) \ + __GUEST_ASSERT((_tval & mask), \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ + _tval, mask, set_expected); \ + else \ + __GUEST_ASSERT(!(_tval & mask), \ + "tval: 0x%lx; mask: 0x%lx; set_expected: %u", \ + _tval, mask, set_expected); \ +} + +/* + * Check if @mask bits in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers + * are set or cleared as specified in @set_expected. + */ +static void check_bitmap_pmu_regs(uint64_t mask, bool set_expected) +{ + GUEST_ASSERT_BITMAP_REG(pmcntenset_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmcntenclr_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmintenset_el1, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmintenclr_el1, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmovsset_el0, mask, set_expected); + GUEST_ASSERT_BITMAP_REG(pmovsclr_el0, mask, set_expected); +} + +/* + * Check if the bit in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers corresponding + * to the specified counter (@pmc_idx) can be read/written as expected. + * When @set_op is true, it tries to set the bit for the counter in + * those registers by writing the SET registers (the bit won't be set + * if the counter is not implemented though). 
+ * Otherwise, it tries to clear the bits in the registers by writing + * the CLR registers. + * Then, it checks if the values indicated in the registers are as expected. + */ +static void test_bitmap_pmu_regs(int pmc_idx, bool set_op) +{ + uint64_t pmcr_n, test_bit = BIT(pmc_idx); + bool set_expected = false; + + if (set_op) { + write_sysreg(test_bit, pmcntenset_el0); + write_sysreg(test_bit, pmintenset_el1); + write_sysreg(test_bit, pmovsset_el0); + + /* The bit will be set only if the counter is implemented */ + pmcr_n = get_pmcr_n(read_sysreg(pmcr_el0)); + set_expected = (pmc_idx < pmcr_n) ? true : false; + } else { + write_sysreg(test_bit, pmcntenclr_el0); + write_sysreg(test_bit, pmintenclr_el1); + write_sysreg(test_bit, pmovsclr_el0); + } + check_bitmap_pmu_regs(test_bit, set_expected); +} + +/* + * Tests for reading/writing registers for the (implemented) event counter + * specified by @pmc_idx. + */ +static void test_access_pmc_regs(struct pmc_accessor *acc, int pmc_idx) +{ + uint64_t write_data, read_data; + + /* Disable all PMCs and reset all PMCs to zero. */ + pmu_disable_reset(); + + /* + * Tests for reading/writing {PMCNTEN,PMINTEN,PMOVS}{SET,CLR}_EL1. + */ + + /* Make sure that the bit in those registers are set to 0 */ + test_bitmap_pmu_regs(pmc_idx, false); + /* Test if setting the bit in those registers works */ + test_bitmap_pmu_regs(pmc_idx, true); + /* Test if clearing the bit in those registers works */ + test_bitmap_pmu_regs(pmc_idx, false); + + /* + * Tests for reading/writing the event type register. + */ + + /* + * Set the event type register to an arbitrary value just for testing + * of reading/writing the register. + * Arm ARM says that for the event from 0x0000 to 0x003F, + * the value indicated in the PMEVTYPER<n>_EL0.evtCount field is + * the value written to the field even when the specified event + * is not supported. + */ + write_data = (ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMUV3_PERFCTR_INST_RETIRED); + acc->write_typer(pmc_idx, write_data); + read_data = acc->read_typer(pmc_idx); + __GUEST_ASSERT(read_data == write_data, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); + + /* + * Tests for reading/writing the event count register. + */ + + read_data = acc->read_cntr(pmc_idx); + + /* The count value must be 0, as it is disabled and reset */ + __GUEST_ASSERT(read_data == 0, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data); + + write_data = read_data + pmc_idx + 0x12345; + acc->write_cntr(pmc_idx, write_data); + read_data = acc->read_cntr(pmc_idx); + __GUEST_ASSERT(read_data == write_data, + "pmc_idx: 0x%x; acc_idx: 0x%lx; read_data: 0x%lx; write_data: 0x%lx", + pmc_idx, PMC_ACC_TO_IDX(acc), read_data, write_data); +} + +#define INVALID_EC (-1ul) +uint64_t expected_ec = INVALID_EC; + +static void guest_sync_handler(struct ex_regs *regs) +{ + uint64_t esr, ec; + + esr = read_sysreg(esr_el1); + ec = (esr >> ESR_EC_SHIFT) & ESR_EC_MASK; + + __GUEST_ASSERT(expected_ec == ec, + "PC: 0x%lx; ESR: 0x%lx; EC: 0x%lx; EC expected: 0x%lx", + regs->pc, esr, ec, expected_ec); + + /* skip the trapping instruction */ + regs->pc += 4; + + /* Use INVALID_EC to indicate an exception occurred */ + expected_ec = INVALID_EC; +} + +/* + * Run the given operation that should trigger an exception with the + * given exception class. 
The exception handler (guest_sync_handler) + * will reset op_end_addr to 0, expected_ec to INVALID_EC, and skip + * the instruction that trapped. + */ +#define TEST_EXCEPTION(ec, ops) \ +({ \ + GUEST_ASSERT(ec != INVALID_EC); \ + WRITE_ONCE(expected_ec, ec); \ + dsb(ish); \ + ops; \ + GUEST_ASSERT(expected_ec == INVALID_EC); \ +}) + +/* + * Tests for reading/writing registers for the unimplemented event counter + * specified by @pmc_idx (>= PMCR_EL0.N). + */ +static void test_access_invalid_pmc_regs(struct pmc_accessor *acc, int pmc_idx) +{ + /* + * Reading/writing the event count/type registers should cause + * an UNDEFINED exception. + */ + TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_cntr(pmc_idx)); + TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_cntr(pmc_idx, 0)); + TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->read_typer(pmc_idx)); + TEST_EXCEPTION(ESR_EC_UNKNOWN, acc->write_typer(pmc_idx, 0)); + /* + * The bit corresponding to the (unimplemented) counter in + * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers should be RAZ. + */ + test_bitmap_pmu_regs(pmc_idx, 1); + test_bitmap_pmu_regs(pmc_idx, 0); +} + +/* + * The guest is configured with PMUv3 with @expected_pmcr_n number of + * event counters. + * Check if @expected_pmcr_n is consistent with PMCR_EL0.N, and + * if reading/writing PMU registers for implemented or unimplemented + * counters works as expected. + */ +static void guest_code(uint64_t expected_pmcr_n) +{ + uint64_t pmcr, pmcr_n, unimp_mask; + int i, pmc; + + __GUEST_ASSERT(expected_pmcr_n <= ARMV8_PMU_MAX_GENERAL_COUNTERS, + "Expected PMCR.N: 0x%lx; ARMv8 general counters: 0x%x", + expected_pmcr_n, ARMV8_PMU_MAX_GENERAL_COUNTERS); + + pmcr = read_sysreg(pmcr_el0); + pmcr_n = get_pmcr_n(pmcr); + + /* Make sure that PMCR_EL0.N indicates the value userspace set */ + __GUEST_ASSERT(pmcr_n == expected_pmcr_n, + "Expected PMCR.N: 0x%lx, PMCR.N: 0x%lx", + expected_pmcr_n, pmcr_n); + + /* + * Make sure that (RAZ) bits corresponding to unimplemented event + * counters in {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers are reset + * to zero. + * (NOTE: bits for implemented event counters are reset to UNKNOWN) + */ + unimp_mask = GENMASK_ULL(ARMV8_PMU_MAX_GENERAL_COUNTERS - 1, pmcr_n); + check_bitmap_pmu_regs(unimp_mask, false); + + /* + * Tests for reading/writing PMU registers for implemented counters. + * Use each combination of PMEV{CNTR,TYPER}<n>_EL0 accessor functions. + */ + for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { + for (pmc = 0; pmc < pmcr_n; pmc++) + test_access_pmc_regs(&pmc_accessors[i], pmc); + } + + /* + * Tests for reading/writing PMU registers for unimplemented counters. + * Use each combination of PMEV{CNTR,TYPER}<n>_EL0 accessor functions. + */ + for (i = 0; i < ARRAY_SIZE(pmc_accessors); i++) { + for (pmc = pmcr_n; pmc < ARMV8_PMU_MAX_GENERAL_COUNTERS; pmc++) + test_access_invalid_pmc_regs(&pmc_accessors[i], pmc); + } + + GUEST_DONE(); +} + +/* Create a VM that has one vCPU with PMUv3 configured. */ +static void create_vpmu_vm(void *guest_code) +{ + struct kvm_vcpu_init init; + uint8_t pmuver, ec; + uint64_t dfr0, irq = 23; + struct kvm_device_attr irq_attr = { + .group = KVM_ARM_VCPU_PMU_V3_CTRL, + .attr = KVM_ARM_VCPU_PMU_V3_IRQ, + .addr = (uint64_t)&irq, + }; + struct kvm_device_attr init_attr = { + .group = KVM_ARM_VCPU_PMU_V3_CTRL, + .attr = KVM_ARM_VCPU_PMU_V3_INIT, + }; + + /* The test creates the vpmu_vm multiple times. 
Ensure a clean state */ + memset(&vpmu_vm, 0, sizeof(vpmu_vm)); + + vpmu_vm.vm = vm_create(1); + vm_init_descriptor_tables(vpmu_vm.vm); + for (ec = 0; ec < ESR_EC_NUM; ec++) { + vm_install_sync_handler(vpmu_vm.vm, VECTOR_SYNC_CURRENT, ec, + guest_sync_handler); + } + + /* Create vCPU with PMUv3 */ + vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); + vpmu_vm.vcpu = aarch64_vcpu_add(vpmu_vm.vm, 0, &init, guest_code); + vcpu_init_descriptor_tables(vpmu_vm.vcpu); + vpmu_vm.gic_fd = vgic_v3_setup(vpmu_vm.vm, 1, 64); + __TEST_REQUIRE(vpmu_vm.gic_fd >= 0, + "Failed to create vgic-v3, skipping"); + + /* Make sure that PMUv3 support is indicated in the ID register */ + vcpu_get_reg(vpmu_vm.vcpu, + KVM_ARM64_SYS_REG(SYS_ID_AA64DFR0_EL1), &dfr0); + pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), dfr0); + TEST_ASSERT(pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && + pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP, + "Unexpected PMUVER (0x%x) on the vCPU with PMUv3", pmuver); + + /* Initialize vPMU */ + vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &irq_attr); + vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &init_attr); +} + +static void destroy_vpmu_vm(void) +{ + close(vpmu_vm.gic_fd); + kvm_vm_free(vpmu_vm.vm); +} + +static void run_vcpu(struct kvm_vcpu *vcpu, uint64_t pmcr_n) +{ + struct ucall uc; + + vcpu_args_set(vcpu, 1, pmcr_n); + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + break; + } +} + +static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) +{ + struct kvm_vcpu *vcpu; + uint64_t pmcr, pmcr_orig; + + create_vpmu_vm(guest_code); + vcpu = vpmu_vm.vcpu; + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr_orig); + pmcr = pmcr_orig; + + /* + * Setting a larger value of PMCR.N should not modify the field, and + * return a success. + */ + set_pmcr_n(&pmcr, pmcr_n); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), pmcr); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr); + + if (expect_fail) + TEST_ASSERT(pmcr_orig == pmcr, + "PMCR.N modified by KVM to a larger value (PMCR: 0x%lx) for pmcr_n: 0x%lx", + pmcr, pmcr_n); + else + TEST_ASSERT(pmcr_n == get_pmcr_n(pmcr), + "Failed to update PMCR.N to %lu (received: %lu)", + pmcr_n, get_pmcr_n(pmcr)); +} + +/* + * Create a guest with one vCPU, set the PMCR_EL0.N for the vCPU to @pmcr_n, + * and run the test. + */ +static void run_access_test(uint64_t pmcr_n) +{ + uint64_t sp; + struct kvm_vcpu *vcpu; + struct kvm_vcpu_init init; + + pr_debug("Test with pmcr_n %lu\n", pmcr_n); + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + vcpu = vpmu_vm.vcpu; + + /* Save the initial sp to restore them later to run the guest again */ + vcpu_get_reg(vcpu, ARM64_CORE_REG(sp_el1), &sp); + + run_vcpu(vcpu, pmcr_n); + + /* + * Reset and re-initialize the vCPU, and run the guest code again to + * check if PMCR_EL0.N is preserved. 
+ */ + vm_ioctl(vpmu_vm.vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PMU_V3); + aarch64_vcpu_setup(vcpu, &init); + vcpu_init_descriptor_tables(vcpu); + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), sp); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + + run_vcpu(vcpu, pmcr_n); + + destroy_vpmu_vm(); +} + +static struct pmreg_sets validity_check_reg_sets[] = { + PMREG_SET(SYS_PMCNTENSET_EL0, SYS_PMCNTENCLR_EL0), + PMREG_SET(SYS_PMINTENSET_EL1, SYS_PMINTENCLR_EL1), + PMREG_SET(SYS_PMOVSSET_EL0, SYS_PMOVSCLR_EL0), +}; + +/* + * Create a VM, and check if KVM handles the userspace accesses of + * the PMU register sets in @validity_check_reg_sets[] correctly. + */ +static void run_pmregs_validity_test(uint64_t pmcr_n) +{ + int i; + struct kvm_vcpu *vcpu; + uint64_t set_reg_id, clr_reg_id, reg_val; + uint64_t valid_counters_mask, max_counters_mask; + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + vcpu = vpmu_vm.vcpu; + + valid_counters_mask = get_counters_mask(pmcr_n); + max_counters_mask = get_counters_mask(ARMV8_PMU_MAX_COUNTERS); + + for (i = 0; i < ARRAY_SIZE(validity_check_reg_sets); i++) { + set_reg_id = validity_check_reg_sets[i].set_reg_id; + clr_reg_id = validity_check_reg_sets[i].clr_reg_id; + + /* + * Test if the 'set' and 'clr' variants of the registers + * are initialized based on the number of valid counters. + */ + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Initial read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(set_reg_id), reg_val); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Initial read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(clr_reg_id), reg_val); + + /* + * Using the 'set' variant, force-set the register to the + * max number of possible counters and test if KVM discards + * the bits for unimplemented counters as it should. + */ + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), max_counters_mask); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(set_reg_id), ®_val); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Read of set_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(set_reg_id), reg_val); + + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(clr_reg_id), ®_val); + TEST_ASSERT((reg_val & (~valid_counters_mask)) == 0, + "Read of clr_reg: 0x%llx has unimplemented counters enabled: 0x%lx", + KVM_ARM64_SYS_REG(clr_reg_id), reg_val); + } + + destroy_vpmu_vm(); +} + +/* + * Create a guest with one vCPU, and attempt to set the PMCR_EL0.N for + * the vCPU to @pmcr_n, which is larger than the host value. + * The attempt should fail as @pmcr_n is too big to set for the vCPU. + */ +static void run_error_test(uint64_t pmcr_n) +{ + pr_debug("Error test with pmcr_n %lu (larger than the host)\n", pmcr_n); + + test_create_vpmu_vm_with_pmcr_n(pmcr_n, true); + destroy_vpmu_vm(); +} + +/* + * Return the default number of implemented PMU event counters excluding + * the cycle counter (i.e. PMCR_EL0.N value) for the guest. 
+ */
+static uint64_t get_pmcr_n_limit(void)
+{
+ uint64_t pmcr;
+
+ create_vpmu_vm(guest_code);
+ vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0), &pmcr);
+ destroy_vpmu_vm();
+ return get_pmcr_n(pmcr);
+}
+
+int main(void)
+{
+ uint64_t i, pmcr_n;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3));
+
+ pmcr_n = get_pmcr_n_limit();
+ for (i = 0; i <= pmcr_n; i++) {
+ run_access_test(i);
+ run_pmregs_validity_test(i);
+ }
+
+ for (i = pmcr_n + 1; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ run_error_test(i);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
new file mode 100644
index 000000000000..3c7defd34f56
--- /dev/null
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * access_tracking_perf_test
+ *
+ * Copyright (C) 2021, Google, Inc.
+ *
+ * This test measures the performance effects of KVM's access tracking.
+ * Access tracking is driven by the MMU notifiers test_young, clear_young, and
+ * clear_flush_young. These notifiers do not have a direct userspace API,
+ * however the clear_young notifier can be triggered by marking pages as idle
+ * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to
+ * enable access tracking on guest memory.
+ *
+ * To measure performance this test runs a VM with a configurable number of
+ * vCPUs that each touch every page in disjoint regions of memory. Performance
+ * is measured in the time it takes all vCPUs to finish touching their
+ * predefined region.
+ *
+ * Note that a deterministic correctness test of access tracking is not possible
+ * by using page_idle as it exists today. This is for a few reasons:
+ *
+ * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This
+ * means subsequent guest accesses are not guaranteed to see page table
+ * updates made by KVM until some time in the future.
+ *
+ * 2. page_idle only operates on LRU pages. Newly allocated pages are not
+ * immediately allocated to LRU lists. Instead they are held in a "pagevec",
+ * which is drained to LRU lists some time in the future. There is no
+ * userspace API to force this drain to occur.
+ *
+ * These limitations are worked around in this test by using a large enough
+ * region of memory for each vCPU such that the number of translations cached in
+ * the TLB and the number of pages held in pagevecs are a small fraction of the
+ * overall workload. And if either of those conditions are not true (for example
+ * in nesting, where TLB size is unlimited) this test will print a warning
+ * rather than silently passing.
+ */
+#include <inttypes.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "memstress.h"
+#include "guest_modes.h"
+#include "processor.h"
+
+/* Global variable used to synchronize all of the vCPU threads. */
+static int iteration;
+
+/* Defines what vCPU threads should do during a given iteration. */
+static enum {
+ /* Run the vCPU to access all its memory. */
+ ITERATION_ACCESS_MEMORY,
+ /* Mark the vCPU's memory idle in page_idle. */
+ ITERATION_MARK_IDLE,
+} iteration_work;
+
+/* The iteration that was last completed by each vCPU. */
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+
+/* Whether to overlap the regions of memory vCPUs access. 
*/ +static bool overlap_memory_access; + +struct test_params { + /* The backing source for the region of memory. */ + enum vm_mem_backing_src_type backing_src; + + /* The amount of memory to allocate for each vCPU. */ + uint64_t vcpu_memory_bytes; + + /* The number of vCPUs to create in the VM. */ + int nr_vcpus; +}; + +static uint64_t pread_uint64(int fd, const char *filename, uint64_t index) +{ + uint64_t value; + off_t offset = index * sizeof(value); + + TEST_ASSERT(pread(fd, &value, sizeof(value), offset) == sizeof(value), + "pread from %s offset 0x%" PRIx64 " failed!", + filename, offset); + + return value; + +} + +#define PAGEMAP_PRESENT (1ULL << 63) +#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1) + +static uint64_t lookup_pfn(int pagemap_fd, struct kvm_vm *vm, uint64_t gva) +{ + uint64_t hva = (uint64_t) addr_gva2hva(vm, gva); + uint64_t entry; + uint64_t pfn; + + entry = pread_uint64(pagemap_fd, "pagemap", hva / getpagesize()); + if (!(entry & PAGEMAP_PRESENT)) + return 0; + + pfn = entry & PAGEMAP_PFN_MASK; + __TEST_REQUIRE(pfn, "Looking up PFNs requires CAP_SYS_ADMIN"); + + return pfn; +} + +static bool is_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = pread_uint64(page_idle_fd, "page_idle", pfn / 64); + + return !!((bits >> (pfn % 64)) & 1); +} + +static void mark_page_idle(int page_idle_fd, uint64_t pfn) +{ + uint64_t bits = 1ULL << (pfn % 64); + + TEST_ASSERT(pwrite(page_idle_fd, &bits, 8, 8 * (pfn / 64)) == 8, + "Set page_idle bits for PFN 0x%" PRIx64, pfn); +} + +static void mark_vcpu_memory_idle(struct kvm_vm *vm, + struct memstress_vcpu_args *vcpu_args) +{ + int vcpu_idx = vcpu_args->vcpu_idx; + uint64_t base_gva = vcpu_args->gva; + uint64_t pages = vcpu_args->pages; + uint64_t page; + uint64_t still_idle = 0; + uint64_t no_pfn = 0; + int page_idle_fd; + int pagemap_fd; + + /* If vCPUs are using an overlapping region, let vCPU 0 mark it idle. */ + if (overlap_memory_access && vcpu_idx) + return; + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + TEST_ASSERT(page_idle_fd > 0, "Failed to open page_idle."); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + TEST_ASSERT(pagemap_fd > 0, "Failed to open pagemap."); + + for (page = 0; page < pages; page++) { + uint64_t gva = base_gva + page * memstress_args.guest_page_size; + uint64_t pfn = lookup_pfn(pagemap_fd, vm, gva); + + if (!pfn) { + no_pfn++; + continue; + } + + if (is_page_idle(page_idle_fd, pfn)) { + still_idle++; + continue; + } + + mark_page_idle(page_idle_fd, pfn); + } + + /* + * Assumption: Less than 1% of pages are going to be swapped out from + * under us during this test. + */ + TEST_ASSERT(no_pfn < pages / 100, + "vCPU %d: No PFN for %" PRIu64 " out of %" PRIu64 " pages.", + vcpu_idx, no_pfn, pages); + + /* + * Check that at least 90% of memory has been marked idle (the rest + * might not be marked idle because the pages have not yet made it to an + * LRU list or the translations are still cached in the TLB). 90% is + * arbitrary; high enough that we ensure most memory access went through + * access tracking but low enough as to not make the test too brittle + * over time and across architectures. + * + * When running the guest as a nested VM, "warn" instead of asserting + * as the TLB size is effectively unlimited and the KVM doesn't + * explicitly flush the TLB when aging SPTEs. As a result, more pages + * are cached and the guest won't see the "idle" bit cleared. 
+ */ + if (still_idle >= pages / 10) { +#ifdef __x86_64__ + TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR), + "vCPU%d: Too many pages still idle (%lu out of %lu)", + vcpu_idx, still_idle, pages); +#endif + printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), " + "this will affect performance results.\n", + vcpu_idx, still_idle, pages); + } + + close(page_idle_fd); + close(pagemap_fd); +} + +static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall) +{ + struct ucall uc; + uint64_t actual_ucall = get_ucall(vcpu, &uc); + + TEST_ASSERT(expected_ucall == actual_ucall, + "Guest exited unexpectedly (expected ucall %" PRIu64 + ", got %" PRIu64 ")", + expected_ucall, actual_ucall); +} + +static bool spin_wait_for_next_iteration(int *current_iteration) +{ + int last_iteration = *current_iteration; + + do { + if (READ_ONCE(memstress_args.stop_vcpus)) + return false; + + *current_iteration = READ_ONCE(iteration); + } while (last_iteration == *current_iteration); + + return true; +} + +static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + struct kvm_vm *vm = memstress_args.vm; + int vcpu_idx = vcpu_args->vcpu_idx; + int current_iteration = 0; + + while (spin_wait_for_next_iteration(¤t_iteration)) { + switch (READ_ONCE(iteration_work)) { + case ITERATION_ACCESS_MEMORY: + vcpu_run(vcpu); + assert_ucall(vcpu, UCALL_SYNC); + break; + case ITERATION_MARK_IDLE: + mark_vcpu_memory_idle(vm, vcpu_args); + break; + }; + + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; + } +} + +static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration) +{ + while (READ_ONCE(vcpu_last_completed_iteration[vcpu_idx]) != + target_iteration) { + continue; + } +} + +/* The type of memory accesses to perform in the VM. */ +enum access_type { + ACCESS_READ, + ACCESS_WRITE, +}; + +static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description) +{ + struct timespec ts_start; + struct timespec ts_elapsed; + int next_iteration, i; + + /* Kick off the vCPUs by incrementing iteration. */ + next_iteration = ++iteration; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + + /* Wait for all vCPUs to finish the iteration. */ + for (i = 0; i < nr_vcpus; i++) + spin_wait_for_vcpu(i, next_iteration); + + ts_elapsed = timespec_elapsed(ts_start); + pr_info("%-30s: %ld.%09lds\n", + description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec); +} + +static void access_memory(struct kvm_vm *vm, int nr_vcpus, + enum access_type access, const char *description) +{ + memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100); + iteration_work = ITERATION_ACCESS_MEMORY; + run_iteration(vm, nr_vcpus, description); +} + +static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus) +{ + /* + * Even though this parallelizes the work across vCPUs, this is still a + * very slow operation because page_idle forces the test to mark one pfn + * at a time and the clear_young notifier serializes on the KVM MMU + * lock. 
+ */ + pr_debug("Marking VM memory idle (slow)...\n"); + iteration_work = ITERATION_MARK_IDLE; + run_iteration(vm, nr_vcpus, "Mark memory idle"); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *params = arg; + struct kvm_vm *vm; + int nr_vcpus = params->nr_vcpus; + + vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1, + params->backing_src, !overlap_memory_access); + + memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main); + + pr_info("\n"); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory"); + + /* As a control, read and write to the populated memory first. */ + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory"); + access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory"); + + /* Repeat on memory that has been marked as idle. */ + mark_memory_idle(vm, nr_vcpus); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory"); + mark_memory_idle(vm, nr_vcpus); + access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory"); + + memstress_join_vcpu_threads(nr_vcpus); + memstress_destroy_vm(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n", + name); + puts(""); + printf(" -h: Display this help message."); + guest_modes_help(); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + backing_src_help("-s"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + struct test_params params = { + .backing_src = DEFAULT_VM_MEM_SRC, + .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, + .nr_vcpus = 1, + }; + int page_idle_fd; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 'b': + params.vcpu_memory_bytes = parse_size(optarg); + break; + case 'v': + params.nr_vcpus = atoi_positive("Number of vCPUs", optarg); + break; + case 'o': + overlap_memory_access = true; + break; + case 's': + params.backing_src = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + __TEST_REQUIRE(page_idle_fd >= 0, + "CONFIG_IDLE_PAGE_TRACKING is not enabled"); + close(page_idle_fd); + + for_each_guest_mode(run_test, ¶ms); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arch_timer.c b/tools/testing/selftests/kvm/arch_timer.c new file mode 100644 index 000000000000..acb2cb596332 --- /dev/null +++ b/tools/testing/selftests/kvm/arch_timer.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch_timer.c - Tests the arch timer IRQ functionality + * + * The guest's main thread configures the timer interrupt and waits + * for it to fire, with a timeout equal to the timer period. + * It asserts that the timeout doesn't exceed the timer period plus + * a user configurable error margin(default to 100us) + * + * On the other hand, upon receipt of an interrupt, the guest's interrupt + * handler validates the interrupt by checking if the architectural state + * is in compliance with the specifications. 
+ * + * The test provides command-line options to configure the timer's + * period (-p), number of vCPUs (-n), iterations per stage (-i) and timer + * interrupt arrival error margin (-e). To stress-test the timer stack + * even more, an option to migrate the vCPUs across pCPUs (-m), at a + * particular rate, is also provided. + * + * Copyright (c) 2021, Google LLC. + */ +#include <stdlib.h> +#include <pthread.h> +#include <linux/sizes.h> +#include <linux/bitmap.h> +#include <sys/sysinfo.h> + +#include "timer_test.h" +#include "ucall_common.h" + +struct test_args test_args = { + .nr_vcpus = NR_VCPUS_DEF, + .nr_iter = NR_TEST_ITERS_DEF, + .timer_period_ms = TIMER_TEST_PERIOD_MS_DEF, + .migration_freq_ms = TIMER_TEST_MIGRATION_FREQ_MS, + .timer_err_margin_us = TIMER_TEST_ERR_MARGIN_US, + .reserved = 1, +}; + +struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; +struct test_vcpu_shared_data vcpu_shared_data[KVM_MAX_VCPUS]; + +static pthread_t pt_vcpu_run[KVM_MAX_VCPUS]; +static unsigned long *vcpu_done_map; +static pthread_mutex_t vcpu_done_map_lock; + +static void *test_vcpu_run(void *arg) +{ + unsigned int vcpu_idx = (unsigned long)arg; + struct ucall uc; + struct kvm_vcpu *vcpu = vcpus[vcpu_idx]; + struct kvm_vm *vm = vcpu->vm; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[vcpu_idx]; + + vcpu_run(vcpu); + + /* Currently, any exit from guest is an indication of completion */ + pthread_mutex_lock(&vcpu_done_map_lock); + __set_bit(vcpu_idx, vcpu_done_map); + pthread_mutex_unlock(&vcpu_done_map_lock); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + case UCALL_DONE: + break; + case UCALL_ABORT: + sync_global_from_guest(vm, *shared_data); + fprintf(stderr, "Guest assert failed, vcpu %u; stage; %u; iter: %u\n", + vcpu_idx, shared_data->guest_stage, shared_data->nr_iter); + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected guest exit"); + } + + pr_info("PASS(vCPU-%d).\n", vcpu_idx); + + return NULL; +} + +static uint32_t test_get_pcpu(void) +{ + uint32_t pcpu; + unsigned int nproc_conf; + cpu_set_t online_cpuset; + + nproc_conf = get_nprocs_conf(); + sched_getaffinity(0, sizeof(cpu_set_t), &online_cpuset); + + /* Randomly find an available pCPU to place a vCPU on */ + do { + pcpu = rand() % nproc_conf; + } while (!CPU_ISSET(pcpu, &online_cpuset)); + + return pcpu; +} + +static int test_migrate_vcpu(unsigned int vcpu_idx) +{ + int ret; + cpu_set_t cpuset; + uint32_t new_pcpu = test_get_pcpu(); + + CPU_ZERO(&cpuset); + CPU_SET(new_pcpu, &cpuset); + + pr_debug("Migrating vCPU: %u to pCPU: %u\n", vcpu_idx, new_pcpu); + + ret = pthread_setaffinity_np(pt_vcpu_run[vcpu_idx], + sizeof(cpuset), &cpuset); + + /* Allow the error where the vCPU thread is already finished */ + TEST_ASSERT(ret == 0 || ret == ESRCH, + "Failed to migrate the vCPU:%u to pCPU: %u; ret: %d", + vcpu_idx, new_pcpu, ret); + + return ret; +} + +static void *test_vcpu_migration(void *arg) +{ + unsigned int i, n_done; + bool vcpu_done; + + do { + usleep(msecs_to_usecs(test_args.migration_freq_ms)); + + for (n_done = 0, i = 0; i < test_args.nr_vcpus; i++) { + pthread_mutex_lock(&vcpu_done_map_lock); + vcpu_done = test_bit(i, vcpu_done_map); + pthread_mutex_unlock(&vcpu_done_map_lock); + + if (vcpu_done) { + n_done++; + continue; + } + + test_migrate_vcpu(i); + } + } while (test_args.nr_vcpus != n_done); + + return NULL; +} + +static void test_run(struct kvm_vm *vm) +{ + pthread_t pt_vcpu_migration; + unsigned int i; + int ret; + + pthread_mutex_init(&vcpu_done_map_lock, NULL); + vcpu_done_map = 
bitmap_zalloc(test_args.nr_vcpus); + TEST_ASSERT(vcpu_done_map, "Failed to allocate vcpu done bitmap"); + + for (i = 0; i < (unsigned long)test_args.nr_vcpus; i++) { + ret = pthread_create(&pt_vcpu_run[i], NULL, test_vcpu_run, + (void *)(unsigned long)i); + TEST_ASSERT(!ret, "Failed to create vCPU-%d pthread", i); + } + + /* Spawn a thread to control the vCPU migrations */ + if (test_args.migration_freq_ms) { + srand(time(NULL)); + + ret = pthread_create(&pt_vcpu_migration, NULL, + test_vcpu_migration, NULL); + TEST_ASSERT(!ret, "Failed to create the migration pthread"); + } + + + for (i = 0; i < test_args.nr_vcpus; i++) + pthread_join(pt_vcpu_run[i], NULL); + + if (test_args.migration_freq_ms) + pthread_join(pt_vcpu_migration, NULL); + + bitmap_free(vcpu_done_map); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-n nr_vcpus] [-i iterations] [-p timer_period_ms]\n" + "\t\t [-m migration_freq_ms] [-o counter_offset]\n" + "\t\t [-e timer_err_margin_us]\n", name); + pr_info("\t-n: Number of vCPUs to configure (default: %u; max: %u)\n", + NR_VCPUS_DEF, KVM_MAX_VCPUS); + pr_info("\t-i: Number of iterations per stage (default: %u)\n", + NR_TEST_ITERS_DEF); + pr_info("\t-p: Periodicity (in ms) of the guest timer (default: %u)\n", + TIMER_TEST_PERIOD_MS_DEF); + pr_info("\t-m: Frequency (in ms) of vCPUs to migrate to different pCPU. 0 to turn off (default: %u)\n", + TIMER_TEST_MIGRATION_FREQ_MS); + pr_info("\t-o: Counter offset (in counter cycles, default: 0) [aarch64-only]\n"); + pr_info("\t-e: Interrupt arrival error margin (in us) of the guest timer (default: %u)\n", + TIMER_TEST_ERR_MARGIN_US); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hn:i:p:m:o:e:")) != -1) { + switch (opt) { + case 'n': + test_args.nr_vcpus = atoi_positive("Number of vCPUs", optarg); + if (test_args.nr_vcpus > KVM_MAX_VCPUS) { + pr_info("Max allowed vCPUs: %u\n", + KVM_MAX_VCPUS); + goto err; + } + break; + case 'i': + test_args.nr_iter = atoi_positive("Number of iterations", optarg); + break; + case 'p': + test_args.timer_period_ms = atoi_positive("Periodicity", optarg); + break; + case 'm': + test_args.migration_freq_ms = atoi_non_negative("Frequency", optarg); + break; + case 'e': + test_args.timer_err_margin_us = atoi_non_negative("Error Margin", optarg); + break; + case 'o': + test_args.counter_offset = strtol(optarg, NULL, 0); + test_args.reserved = 0; + break; + case 'h': + default: + goto err; + } + } + + return true; + +err: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + __TEST_REQUIRE(!test_args.migration_freq_ms || get_nprocs() >= 2, + "At least two physical CPUs needed for vCPU migration"); + + vm = test_vm_create(); + test_run(vm); + test_vm_cleanup(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/clear_dirty_log_test.c b/tools/testing/selftests/kvm/clear_dirty_log_test.c deleted file mode 100644 index 11672ec6f74e..000000000000 --- a/tools/testing/selftests/kvm/clear_dirty_log_test.c +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_CLEAR_DIRTY_LOG -#define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) -#define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) -#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ - KVM_DIRTY_LOG_INITIALLY_SET) -#include "dirty_log_test.c" diff --git a/tools/testing/selftests/kvm/config 
b/tools/testing/selftests/kvm/config index 63ed533f73d6..8835fed09e9f 100644 --- a/tools/testing/selftests/kvm/config +++ b/tools/testing/selftests/kvm/config @@ -1,3 +1,5 @@ CONFIG_KVM=y CONFIG_KVM_INTEL=y CONFIG_KVM_AMD=y +CONFIG_USERFAULTFD=y +CONFIG_IDLE_PAGE_TRACKING=y diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 360cd3ea4cd6..0202b78f8680 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -6,628 +6,323 @@ * Copyright (C) 2018, Red Hat, Inc. * Copyright (C) 2019, Google, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - +#include <inttypes.h> #include <stdio.h> #include <stdlib.h> -#include <sys/syscall.h> -#include <unistd.h> -#include <asm/unistd.h> #include <time.h> -#include <poll.h> #include <pthread.h> -#include <linux/bitmap.h> -#include <linux/bitops.h> #include <linux/userfaultfd.h> +#include <sys/syscall.h> -#include "test_util.h" #include "kvm_util.h" -#include "processor.h" +#include "test_util.h" +#include "memstress.h" +#include "guest_modes.h" +#include "ucall_common.h" +#include "userfaultfd_util.h" #ifdef __NR_userfaultfd -/* The memory slot index demand page */ -#define TEST_MEM_SLOT_INDEX 1 - -/* Default guest test virtual memory offset */ -#define DEFAULT_GUEST_TEST_MEM 0xc0000000 - -#define DEFAULT_GUEST_TEST_MEM_SIZE (1 << 30) /* 1G */ - -#ifdef PRINT_PER_PAGE_UPDATES -#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) -#else -#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) -#endif - -#ifdef PRINT_PER_VCPU_UPDATES -#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) -#else -#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) -#endif - -#define MAX_VCPUS 512 - -/* - * Guest/Host shared variables. Ensure addr_gva2hva() and/or - * sync_global_to/from_guest() are used when accessing from - * the host. READ/WRITE_ONCE() should also be used with anything - * that may change. - */ -static uint64_t host_page_size; -static uint64_t guest_page_size; +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +static size_t demand_paging_size; static char *guest_data_prototype; -/* - * Guest physical memory offset of the testing memory slot. - * This will be set to the topmost valid physical address minus - * the test memory size. - */ -static uint64_t guest_test_phys_mem; - -/* - * Guest virtual memory offset of the testing memory slot. - * Must not conflict with identity mapped test code. - */ -static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; - -struct vcpu_args { - uint64_t gva; - uint64_t pages; - - /* Only used by the host userspace part of the vCPU thread */ - int vcpu_id; - struct kvm_vm *vm; -}; - -static struct vcpu_args vcpu_args[MAX_VCPUS]; - -/* - * Continuously write to the first 8 bytes of each page in the demand paging - * memory region. - */ -static void guest_code(uint32_t vcpu_id) -{ - uint64_t gva; - uint64_t pages; - int i; - - /* Make sure vCPU args data structure is not corrupt. 
*/ - GUEST_ASSERT(vcpu_args[vcpu_id].vcpu_id == vcpu_id); - - gva = vcpu_args[vcpu_id].gva; - pages = vcpu_args[vcpu_id].pages; - - for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * guest_page_size); - - addr &= ~(host_page_size - 1); - *(uint64_t *)addr = 0x0123456789ABCDEF; - } - - GUEST_SYNC(1); -} - -static void *vcpu_worker(void *data) +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) { + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + int vcpu_idx = vcpu_args->vcpu_idx; + struct kvm_run *run = vcpu->run; + struct timespec start; + struct timespec ts_diff; int ret; - struct vcpu_args *args = (struct vcpu_args *)data; - struct kvm_vm *vm = args->vm; - int vcpu_id = args->vcpu_id; - struct kvm_run *run; - struct timespec start, end, ts_diff; - - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - run = vcpu_state(vm, vcpu_id); clock_gettime(CLOCK_MONOTONIC, &start); /* Let the guest access its memory */ - ret = _vcpu_run(vm, vcpu_id); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) { + ret = _vcpu_run(vcpu); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); + if (get_ucall(vcpu, NULL) != UCALL_SYNC) { TEST_ASSERT(false, - "Invalid guest sync status: exit_reason=%s\n", + "Invalid guest sync status: exit_reason=%s", exit_reason_str(run->exit_reason)); } - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); - PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, + ts_diff = timespec_elapsed(start); + PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_idx, ts_diff.tv_sec, ts_diff.tv_nsec); - - return NULL; -} - -#define PAGE_SHIFT_4K 12 -#define PTES_PER_4K_PT 512 - -static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes) -{ - struct kvm_vm *vm; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES; - - /* Account for a few pages per-vCPU for stacks */ - pages += DEFAULT_STACK_PGS * vcpus; - - /* - * Reserve twice the ammount of memory needed to map the test region and - * the page table / stacks region, at 4k, for page tables. Do the - * calculation with 4K page size: the smallest of all archs. (e.g., 64K - * page size guest will need even less memory for page tables). 
- */ - pages += (2 * pages) / PTES_PER_4K_PT; - pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / - PTES_PER_4K_PT; - pages = vm_adjust_num_guest_pages(mode, pages); - - pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - - vm = _vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif - return vm; } -static int handle_uffd_page_request(int uffd, uint64_t addr) +static int handle_uffd_page_request(int uffd_mode, int uffd, + struct uffd_msg *msg) { - pid_t tid; + pid_t tid = syscall(__NR_gettid); + uint64_t addr = msg->arg.pagefault.address; struct timespec start; - struct timespec end; - struct uffdio_copy copy; + struct timespec ts_diff; int r; - tid = syscall(__NR_gettid); - - copy.src = (uint64_t)guest_data_prototype; - copy.dst = addr; - copy.len = host_page_size; - copy.mode = 0; - clock_gettime(CLOCK_MONOTONIC, &start); - r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", - addr, tid, errno); - return r; + if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) { + struct uffdio_copy copy; + + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = demand_paging_size; + copy.mode = 0; + + r = ioctl(uffd, UFFDIO_COPY, ©); + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). + */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d, errno = %d\n", + addr, tid, errno); + return r; + } + } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + struct uffdio_continue cont = {0}; + + cont.range.start = addr; + cont.range.len = demand_paging_size; + + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); + /* + * With multiple vCPU threads fault on a single page and there are + * multiple readers for the UFFD, at least one of the UFFDIO_COPYs + * will fail with EEXIST: handle that case without signaling an + * error. + * + * Note that this also suppress any EEXISTs occurring from, + * e.g., the first UFFDIO_COPY/CONTINUEs on a page. That never + * happens here, but a realistic VMM might potentially maintain + * some external state to correctly surface EEXISTs to userspace + * (or prevent duplicate COPY/CONTINUEs in the first place). 
+ */ + if (r == -1 && errno != EEXIST) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx, thread %d, errno = %d\n", + addr, tid, errno); + return r; + } + } else { + TEST_FAIL("Invalid uffd mode %d", uffd_mode); } - clock_gettime(CLOCK_MONOTONIC, &end); + ts_diff = timespec_elapsed(start); - PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, - timespec_to_ns(timespec_sub(end, start))); + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, + timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - host_page_size, addr, tid); + demand_paging_size, addr, tid); return 0; } -bool quit_uffd_thread; - -struct uffd_handler_args { - int uffd; - int pipefd; - useconds_t delay; +struct test_params { + int uffd_mode; + bool single_uffd; + useconds_t uffd_delay; + int readers_per_uffd; + enum vm_mem_backing_src_type src_type; + bool partition_vcpu_memory_access; }; -static void *uffd_handler_thread_fn(void *arg) -{ - struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg; - int uffd = uffd_args->uffd; - int pipefd = uffd_args->pipefd; - useconds_t delay = uffd_args->delay; - int64_t pages = 0; - struct timespec start, end, ts_diff; - - clock_gettime(CLOCK_MONOTONIC, &start); - while (!quit_uffd_thread) { - struct uffd_msg msg; - struct pollfd pollfd[2]; - char tmp_chr; - int r; - uint64_t addr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd; - pollfd[1].events = POLLIN; - - r = poll(pollfd, 2, -1); - switch (r) { - case -1: - pr_info("poll err"); - continue; - case 0: - continue; - case 1: - break; - default: - pr_info("Polling uffd returned %d", r); - return NULL; - } - - if (pollfd[0].revents & POLLERR) { - pr_info("uffd revents has POLLERR"); - return NULL; - } - - if (pollfd[1].revents & POLLIN) { - r = read(pollfd[1].fd, &tmp_chr, 1); - TEST_ASSERT(r == 1, - "Error reading pipefd in UFFD thread\n"); - return NULL; - } - - if (!pollfd[0].revents & POLLIN) - continue; - - r = read(uffd, &msg, sizeof(msg)); - if (r == -1) { - if (errno == EAGAIN) - continue; - pr_info("Read of uffd gor errno %d", errno); - return NULL; - } - - if (r != sizeof(msg)) { - pr_info("Read on uffd returned unexpected size: %d bytes", r); - return NULL; - } - - if (!(msg.event & UFFD_EVENT_PAGEFAULT)) - continue; - - if (delay) - usleep(delay); - addr = msg.arg.pagefault.address; - r = handle_uffd_page_request(uffd, addr); - if (r < 0) - return NULL; - pages++; - } - - clock_gettime(CLOCK_MONOTONIC, &end); - ts_diff = timespec_sub(end, start); - PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. 
(%f/sec)\n", - pages, ts_diff.tv_sec, ts_diff.tv_nsec, - pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - - return NULL; -} - -static int setup_demand_paging(struct kvm_vm *vm, - pthread_t *uffd_handler_thread, int pipefd, - useconds_t uffd_delay, - struct uffd_handler_args *uffd_args, - void *hva, uint64_t len) +static void prefault_mem(void *alias, uint64_t len) { - int uffd; - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - pr_info("uffd creation failed\n"); - return -1; - } + size_t p; - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - pr_info("ioctl uffdio_api failed\n"); - return -1; + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); + for (p = 0; p < (len / demand_paging_size); ++p) { + memcpy(alias + (p * demand_paging_size), + guest_data_prototype, demand_paging_size); } - - uffdio_register.range.start = (uint64_t)hva; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - pr_info("ioctl uffdio_register failed\n"); - return -1; - } - - if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != - UFFD_API_RANGE_IOCTLS) { - pr_info("unexpected userfaultfd ioctl set\n"); - return -1; - } - - uffd_args->uffd = uffd; - uffd_args->pipefd = pipefd; - uffd_args->delay = uffd_delay; - pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn, - uffd_args); - - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", - hva, hva + len); - - return 0; } -static void run_test(enum vm_guest_mode mode, bool use_uffd, - useconds_t uffd_delay, int vcpus, - uint64_t vcpu_memory_bytes) +static void run_test(enum vm_guest_mode mode, void *arg) { - pthread_t *vcpu_threads; - pthread_t *uffd_handler_threads = NULL; - struct uffd_handler_args *uffd_args = NULL; - struct timespec start, end, ts_diff; - int *pipefds = NULL; + struct memstress_vcpu_args *vcpu_args; + struct test_params *p = arg; + struct uffd_desc **uffd_descs = NULL; + uint64_t uffd_region_size; + struct timespec start; + struct timespec ts_diff; + double vcpu_paging_rate; struct kvm_vm *vm; - uint64_t guest_num_pages; - int vcpu_id; - int r; - - vm = create_vm(mode, vcpus, vcpu_memory_bytes); - - guest_page_size = vm_get_page_size(vm); - - TEST_ASSERT(vcpu_memory_bytes % guest_page_size == 0, - "Guest memory size is not guest page size aligned."); - - guest_num_pages = (vcpus * vcpu_memory_bytes) / guest_page_size; - guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); - - /* - * If there should be more memory in the guest test region than there - * can be pages in the guest, it will definitely cause problems. 
- */ - TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), - "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", - guest_num_pages, vm_get_max_gfn(vm), vcpus, - vcpu_memory_bytes); - - host_page_size = getpagesize(); - TEST_ASSERT(vcpu_memory_bytes % host_page_size == 0, - "Guest memory size is not host page size aligned."); + int i, num_uffds = 0; - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - guest_page_size; - guest_test_phys_mem &= ~(host_page_size - 1); + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + p->src_type, p->partition_vcpu_memory_access); -#ifdef __s390x__ - /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); -#endif + demand_paging_size = get_backing_src_pagesz(p->src_type); - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - - /* Add an extra memory slot for testing demand paging */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, - TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); - - /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); - - ucall_init(vm, NULL); - - guest_data_prototype = malloc(host_page_size); + guest_data_prototype = malloc(demand_paging_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, host_page_size); - - vcpu_threads = malloc(vcpus * sizeof(*vcpu_threads)); - TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - - if (use_uffd) { - uffd_handler_threads = - malloc(vcpus * sizeof(*uffd_handler_threads)); - TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); - - uffd_args = malloc(vcpus * sizeof(*uffd_args)); - TEST_ASSERT(uffd_args, "Memory allocation failed"); - - pipefds = malloc(sizeof(int) * vcpus * 2); - TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); + memset(guest_data_prototype, 0xAB, demand_paging_size); + + if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + for (i = 0; i < num_uffds; i++) { + vcpu_args = &memstress_args.vcpu_args[i]; + prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), + vcpu_args->pages * memstress_args.guest_page_size); + } } - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vm_paddr_t vcpu_gpa; - void *vcpu_hva; - - vm_vcpu_add_default(vm, vcpu_id, guest_code); + if (p->uffd_mode) { + num_uffds = p->single_uffd ? 1 : nr_vcpus; + uffd_region_size = nr_vcpus * guest_percpu_mem_size / num_uffds; - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); - PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + uffd_descs = malloc(num_uffds * sizeof(struct uffd_desc *)); + TEST_ASSERT(uffd_descs, "Memory allocation failed"); + for (i = 0; i < num_uffds; i++) { + struct memstress_vcpu_args *vcpu_args; + void *vcpu_hva; - /* Cache the HVA pointer of the region */ - vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + vcpu_args = &memstress_args.vcpu_args[i]; - if (use_uffd) { + /* Cache the host addresses of the region */ + vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); /* * Set up user fault fd to handle demand paging * requests. 
*/ - r = pipe2(&pipefds[vcpu_id * 2], - O_CLOEXEC | O_NONBLOCK); - TEST_ASSERT(!r, "Failed to set up pipefd"); - - r = setup_demand_paging(vm, - &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], - uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_memory_bytes); - if (r < 0) - exit(-r); + uffd_descs[i] = uffd_setup_demand_paging( + p->uffd_mode, p->uffd_delay, vcpu_hva, + uffd_region_size, + p->readers_per_uffd, + &handle_uffd_page_request); } - -#ifdef __x86_64__ - vcpu_set_cpuid(vm, vcpu_id, kvm_get_supported_cpuid()); -#endif - - vcpu_args[vcpu_id].vm = vm; - vcpu_args[vcpu_id].vcpu_id = vcpu_id; - vcpu_args[vcpu_id].gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args[vcpu_id].pages = vcpu_memory_bytes / guest_page_size; } - /* Export the shared variables to the guest */ - sync_global_to_guest(vm, host_page_size); - sync_global_to_guest(vm, guest_page_size); - sync_global_to_guest(vm, vcpu_args); - pr_info("Finished creating vCPUs and starting uffd threads\n"); clock_gettime(CLOCK_MONOTONIC, &start); - - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &vcpu_args[vcpu_id]); - } - + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); pr_info("Started all vCPUs\n"); - /* Wait for the vcpu threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - pthread_join(vcpu_threads[vcpu_id], NULL); - PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); - } - + memstress_join_vcpu_threads(nr_vcpus); + ts_diff = timespec_elapsed(start); pr_info("All vCPU threads joined\n"); - clock_gettime(CLOCK_MONOTONIC, &end); - - if (use_uffd) { - char c; - + if (p->uffd_mode) { /* Tell the user fault fd handler threads to quit */ - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - r = write(pipefds[vcpu_id * 2 + 1], &c, 1); - TEST_ASSERT(r == 1, "Unable to write to pipefd"); - - pthread_join(uffd_handler_threads[vcpu_id], NULL); - } + for (i = 0; i < num_uffds; i++) + uffd_stop_demand_paging(uffd_descs[i]); } - ts_diff = timespec_sub(end, start); - pr_info("Total guest execution time: %ld.%.9lds\n", + pr_info("Total guest execution time:\t%ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - pr_info("Overall demand paging rate: %f pgs/sec\n", - guest_num_pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - ucall_uninit(vm); - kvm_vm_free(vm); + vcpu_paging_rate = memstress_args.vcpu_args[0].pages / + ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC); + pr_info("Per-vcpu demand paging rate:\t%f pgs/sec/vcpu\n", + vcpu_paging_rate); + pr_info("Overall demand paging rate:\t%f pgs/sec\n", + vcpu_paging_rate * nr_vcpus); + + memstress_destroy_vm(vm); free(guest_data_prototype); - free(vcpu_threads); - if (use_uffd) { - free(uffd_handler_threads); - free(uffd_args); - free(pipefds); - } + if (p->uffd_mode) + free(uffd_descs); } -struct guest_mode { - bool supported; - bool enabled; -}; -static struct guest_mode guest_modes[NUM_VM_MODES]; - -#define guest_mode_init(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ -}) - static void help(char *name) { - int i; - puts(""); - printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" - " [-b memory] [-v vcpus]\n", name); - printf(" -m: specify the guest mode ID to test\n" - " (default: test all supported modes)\n" - " This option may be used multiple times.\n" - " Guest mode IDs:\n"); - for (i = 0; i < NUM_VM_MODES; ++i) { - printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - 
guest_modes[i].supported ? " (supported)" : ""); - } - printf(" -u: use User Fault FD to handle vCPU page\n" - " faults.\n"); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-a]\n" + " [-d uffd_delay_usec] [-r readers_per_uffd] [-b memory]\n" + " [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); + guest_modes_help(); + printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" + " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); + kvm_print_vcpu_pinning_help(); + printf(" -a: Use a single userfaultfd for all of guest memory, instead of\n" + " creating one for each region paged by a unique vCPU\n" + " Set implicitly with -o, and no effect without -u.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); + printf(" -r: Set the number of reader threads per uffd.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); + backing_src_help("-s"); printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); puts(""); exit(0); } int main(int argc, char *argv[]) { - bool mode_selected = false; - uint64_t vcpu_memory_bytes = DEFAULT_GUEST_TEST_MEM_SIZE; - int vcpus = 1; - unsigned int mode; - int opt, i; - bool use_uffd = false; - useconds_t uffd_delay = 0; - -#ifdef __x86_64__ - guest_mode_init(VM_MODE_PXXV48_4K, true, true); -#endif -#ifdef __aarch64__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); - guest_mode_init(VM_MODE_P40V48_64K, true, true); - { - unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - - if (limit >= 52) - guest_mode_init(VM_MODE_P52V48_64K, true, true); - if (limit >= 48) { - guest_mode_init(VM_MODE_P48V48_4K, true, true); - guest_mode_init(VM_MODE_P48V48_64K, true, true); - } - } -#endif -#ifdef __s390x__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); -#endif - - while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) { + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + const char *cpulist = NULL; + struct test_params p = { + .src_type = DEFAULT_VM_MEM_SRC, + .partition_vcpu_memory_access = true, + .readers_per_uffd = 1, + .single_uffd = false, + }; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "ahom:u:d:b:s:v:c:r:")) != -1) { switch (opt) { case 'm': - if (!mode_selected) { - for (i = 0; i < NUM_VM_MODES; ++i) - guest_modes[i].enabled = false; - mode_selected = true; - } - mode = strtoul(optarg, NULL, 10); - TEST_ASSERT(mode < NUM_VM_MODES, - "Guest mode ID %d too big", mode); - guest_modes[mode].enabled = true; + guest_modes_cmdline(optarg); break; case 'u': - use_uffd = true; + if (!strcmp("MISSING", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + else if (!strcmp("MINOR", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; + TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); + break; + case 'a': + p.single_uffd = true; break; case 'd': - uffd_delay = strtoul(optarg, NULL, 0); - TEST_ASSERT(uffd_delay >= 0, - "A negative UFFD delay is not supported."); + p.uffd_delay = strtoul(optarg, NULL, 0); + TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); break; case 'b': - vcpu_memory_bytes = parse_size(optarg); + guest_percpu_mem_size = parse_size(optarg); + break; + case 's': + p.src_type = parse_backing_src_type(optarg); break; case 'v': - vcpus = atoi(optarg); 
- TEST_ASSERT(vcpus > 0, - "Must have a positive number of vCPUs"); - TEST_ASSERT(vcpus <= MAX_VCPUS, - "This test does not currently support\n" - "more than %d vCPUs.", MAX_VCPUS); + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); + break; + case 'c': + cpulist = optarg; + break; + case 'o': + p.partition_vcpu_memory_access = false; + p.single_uffd = true; + break; + case 'r': + p.readers_per_uffd = atoi(optarg); + TEST_ASSERT(p.readers_per_uffd >= 1, + "Invalid number of readers per uffd %d: must be >=1", + p.readers_per_uffd); break; case 'h': default: @@ -636,15 +331,19 @@ int main(int argc, char *argv[]) } } - for (i = 0; i < NUM_VM_MODES; ++i) { - if (!guest_modes[i].enabled) - continue; - TEST_ASSERT(guest_modes[i].supported, - "Guest mode ID %d (%s) not supported.", - i, vm_guest_mode_string(i)); - run_test(i, use_uffd, uffd_delay, vcpus, vcpu_memory_bytes); + if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && + !backing_src_is_shared(p.src_type)) { + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); + } + + if (cpulist) { + kvm_parse_vcpu_pinning(cpulist, memstress_args.vcpu_to_pcpu, + nr_vcpus); + memstress_args.pin_vcpus = true; } + for_each_guest_mode(run_test, &p); + return 0; } diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c new file mode 100644 index 000000000000..9f24303acb8c --- /dev/null +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty page logging performance test + * + * Based on dirty_log_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <pthread.h> +#include <linux/bitmap.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "memstress.h" +#include "guest_modes.h" +#include "ucall_common.h" + +#ifdef __aarch64__ +#include "aarch64/vgic.h" + +static int gic_fd; + +static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) +{ + /* + * The test can still run even if hardware does not support GICv3, as it + * is only an optimization to reduce guest exits. 
+ */ + gic_fd = vgic_v3_setup(vm, nr_vcpus, 64); +} + +static void arch_cleanup_vm(struct kvm_vm *vm) +{ + if (gic_fd > 0) + close(gic_fd); +} + +#else /* __aarch64__ */ + +static void arch_setup_vm(struct kvm_vm *vm, unsigned int nr_vcpus) +{ +} + +static void arch_cleanup_vm(struct kvm_vm *vm) +{ +} + +#endif + +/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ +#define TEST_HOST_LOOP_N 2UL + +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +static bool run_vcpus_while_disabling_dirty_logging; + +/* Host variables */ +static u64 dirty_log_manual_caps; +static bool host_quit; +static int iteration; +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + int vcpu_idx = vcpu_args->vcpu_idx; + uint64_t pages_count = 0; + struct kvm_run *run; + struct timespec start; + struct timespec ts_diff; + struct timespec total = (struct timespec){0}; + struct timespec avg; + int ret; + + run = vcpu->run; + + while (!READ_ONCE(host_quit)) { + int current_iteration = READ_ONCE(iteration); + + clock_gettime(CLOCK_MONOTONIC, &start); + ret = _vcpu_run(vcpu); + ts_diff = timespec_elapsed(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s", + exit_reason_str(run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu_idx); + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; + pr_debug("vCPU %d updated last completed iteration to %d\n", + vcpu_idx, vcpu_last_completed_iteration[vcpu_idx]); + + if (current_iteration) { + pages_count += vcpu_args->pages; + total = timespec_add(total, ts_diff); + pr_debug("vCPU %d iteration %d dirty memory time: %ld.%.9lds\n", + vcpu_idx, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } else { + pr_debug("vCPU %d iteration %d populate memory time: %ld.%.9lds\n", + vcpu_idx, current_iteration, ts_diff.tv_sec, + ts_diff.tv_nsec); + } + + /* + * Keep running the guest while dirty logging is being disabled + * (iteration is negative) so that vCPUs are accessing memory + * for the entire duration of zapping collapsible SPTEs. + */ + while (current_iteration == READ_ONCE(iteration) && + READ_ONCE(iteration) >= 0 && !READ_ONCE(host_quit)) {} + } + + avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_idx]); + pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. 
(Avg %ld.%.9lds/iteration)\n", + vcpu_idx, pages_count, vcpu_last_completed_iteration[vcpu_idx], + total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); +} + +struct test_params { + unsigned long iterations; + uint64_t phys_offset; + bool partition_vcpu_memory_access; + enum vm_mem_backing_src_type backing_src; + int slots; + uint32_t write_percent; + bool random_access; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = arg; + struct kvm_vm *vm; + unsigned long **bitmaps; + uint64_t guest_num_pages; + uint64_t host_num_pages; + uint64_t pages_per_slot; + struct timespec start; + struct timespec ts_diff; + struct timespec get_dirty_log_total = (struct timespec){0}; + struct timespec vcpu_dirty_total = (struct timespec){0}; + struct timespec avg; + struct timespec clear_dirty_log_total = (struct timespec){0}; + int i; + + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + p->slots, p->backing_src, + p->partition_vcpu_memory_access); + + memstress_set_write_percent(vm, p->write_percent); + + guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm->page_shift; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + pages_per_slot = host_num_pages / p->slots; + + bitmaps = memstress_alloc_bitmaps(p->slots, pages_per_slot); + + if (dirty_log_manual_caps) + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, + dirty_log_manual_caps); + + arch_setup_vm(vm, nr_vcpus); + + /* Start the iterations */ + iteration = 0; + host_quit = false; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (i = 0; i < nr_vcpus; i++) + vcpu_last_completed_iteration[i] = -1; + + /* + * Use 100% writes during the population phase to ensure all + * memory is actually populated and not just mapped to the zero + * page. The prevents expensive copy-on-write faults from + * occurring during the dirty memory iterations below, which + * would pollute the performance results. + */ + memstress_set_write_percent(vm, 100); + memstress_set_random_access(vm, false); + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); + + /* Allow the vCPUs to populate memory */ + pr_debug("Starting iteration %d - Populating\n", iteration); + for (i = 0; i < nr_vcpus; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != + iteration) + ; + } + + ts_diff = timespec_elapsed(start); + pr_info("Populate memory time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Enable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + memstress_enable_dirty_logging(vm, p->slots); + ts_diff = timespec_elapsed(start); + pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + memstress_set_write_percent(vm, p->write_percent); + memstress_set_random_access(vm, p->random_access); + + while (iteration < p->iterations) { + /* + * Incrementing the iteration number will start the vCPUs + * dirtying memory again. 
+ */ + clock_gettime(CLOCK_MONOTONIC, &start); + iteration++; + + pr_debug("Starting iteration %d\n", iteration); + for (i = 0; i < nr_vcpus; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) + != iteration) + ; + } + + ts_diff = timespec_elapsed(start); + vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff); + pr_info("Iteration %d dirty memory time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + + clock_gettime(CLOCK_MONOTONIC, &start); + memstress_get_dirty_log(vm, bitmaps, p->slots); + ts_diff = timespec_elapsed(start); + get_dirty_log_total = timespec_add(get_dirty_log_total, + ts_diff); + pr_info("Iteration %d get dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + + if (dirty_log_manual_caps) { + clock_gettime(CLOCK_MONOTONIC, &start); + memstress_clear_dirty_log(vm, bitmaps, p->slots, + pages_per_slot); + ts_diff = timespec_elapsed(start); + clear_dirty_log_total = timespec_add(clear_dirty_log_total, + ts_diff); + pr_info("Iteration %d clear dirty log time: %ld.%.9lds\n", + iteration, ts_diff.tv_sec, ts_diff.tv_nsec); + } + } + + /* + * Run vCPUs while dirty logging is being disabled to stress disabling + * in terms of both performance and correctness. Opt-in via command + * line as this significantly increases time to disable dirty logging. + */ + if (run_vcpus_while_disabling_dirty_logging) + WRITE_ONCE(iteration, -1); + + /* Disable dirty logging */ + clock_gettime(CLOCK_MONOTONIC, &start); + memstress_disable_dirty_logging(vm, p->slots); + ts_diff = timespec_elapsed(start); + pr_info("Disabling dirty logging time: %ld.%.9lds\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* + * Tell the vCPU threads to quit. No need to manually check that vCPUs + * have stopped running after disabling dirty logging, the join will + * wait for them to exit. + */ + host_quit = true; + memstress_join_vcpu_threads(nr_vcpus); + + avg = timespec_div(get_dirty_log_total, p->iterations); + pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + p->iterations, get_dirty_log_total.tv_sec, + get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); + + if (dirty_log_manual_caps) { + avg = timespec_div(clear_dirty_log_total, p->iterations); + pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", + p->iterations, clear_dirty_log_total.tv_sec, + clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); + } + + memstress_free_bitmaps(bitmaps, p->slots); + arch_cleanup_vm(vm); + memstress_destroy_vm(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-a] [-i iterations] [-p offset] [-g] " + "[-m mode] [-n] [-b vcpu bytes] [-v vcpus] [-o] [-r random seed ] [-s mem type]" + "[-x memslots] [-w percentage] [-c physical cpus to run test on]\n", name); + puts(""); + printf(" -a: access memory randomly rather than in order.\n"); + printf(" -i: specify iteration counts (default: %"PRIu64")\n", + TEST_HOST_LOOP_N); + printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n" + " makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n" + " KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n" + " and writes will be tracked as soon as dirty logging is\n" + " enabled on the memslot (i.e. 
KVM_DIRTY_LOG_INITIALLY_SET\n" + " is not enabled).\n"); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + guest_modes_help(); + printf(" -n: Run the vCPUs in nested mode (L2)\n"); + printf(" -e: Run vCPUs while dirty logging is being disabled. This\n" + " can significantly increase runtime, especially if there\n" + " isn't a dedicated pCPU for the main thread.\n"); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + printf(" -r: specify the starting random seed.\n"); + backing_src_help("-s"); + printf(" -x: Split the memory region into this number of memslots.\n" + " (default: 1)\n"); + printf(" -w: specify the percentage of pages which should be written to\n" + " as an integer from 0-100 inclusive. This is probabilistic,\n" + " so -w X means each page has an X%% chance of writing\n" + " and a (100-X)%% chance of reading.\n" + " (default: 100 i.e. all pages are written to.)\n"); + kvm_print_vcpu_pinning_help(); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + const char *pcpu_list = NULL; + struct test_params p = { + .iterations = TEST_HOST_LOOP_N, + .partition_vcpu_memory_access = true, + .backing_src = DEFAULT_VM_MEM_SRC, + .slots = 1, + .write_percent = 100, + }; + int opt; + + /* Override the seed to be deterministic by default. */ + guest_random_seed = 1; + + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "ab:c:eghi:m:nop:r:s:v:x:w:")) != -1) { + switch (opt) { + case 'a': + p.random_access = true; + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'c': + pcpu_list = optarg; + break; + case 'e': + /* 'e' is for evil. 
*/ + run_vcpus_while_disabling_dirty_logging = true; + break; + case 'g': + dirty_log_manual_caps = 0; + break; + case 'h': + help(argv[0]); + break; + case 'i': + p.iterations = atoi_positive("Number of iterations", optarg); + break; + case 'm': + guest_modes_cmdline(optarg); + break; + case 'n': + memstress_args.nested = true; + break; + case 'o': + p.partition_vcpu_memory_access = false; + break; + case 'p': + p.phys_offset = strtoull(optarg, NULL, 0); + break; + case 'r': + guest_random_seed = atoi_positive("Random seed", optarg); + break; + case 's': + p.backing_src = parse_backing_src_type(optarg); + break; + case 'v': + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); + break; + case 'w': + p.write_percent = atoi_non_negative("Write percentage", optarg); + TEST_ASSERT(p.write_percent <= 100, + "Write percentage must be between 0 and 100"); + break; + case 'x': + p.slots = atoi_positive("Number of slots", optarg); + break; + default: + help(argv[0]); + break; + } + } + + if (pcpu_list) { + kvm_parse_vcpu_pinning(pcpu_list, memstress_args.vcpu_to_pcpu, + nr_vcpus); + memstress_args.pin_vcpus = true; + } + + TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations"); + + pr_info("Test iterations: %"PRIu64"\n", p.iterations); + + for_each_guest_mode(run_test, &p); + + return 0; +} diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 752ec158ac59..aacf80f57439 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -4,22 +4,26 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> -#include <unistd.h> -#include <time.h> #include <pthread.h> +#include <semaphore.h> +#include <sys/types.h> +#include <signal.h> +#include <errno.h> #include <linux/bitmap.h> #include <linux/bitops.h> +#include <linux/atomic.h> +#include <asm/barrier.h> -#include "test_util.h" #include "kvm_util.h" +#include "test_util.h" +#include "guest_modes.h" #include "processor.h" +#include "ucall_common.h" -#define VCPU_ID 1 +#define DIRTY_MEM_BITS 30 /* 1G */ +#define PAGE_SHIFT_4K 12 /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 @@ -41,22 +45,26 @@ # define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) # define test_bit_le(nr, addr) \ test_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define set_bit_le(nr, addr) \ - set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define clear_bit_le(nr, addr) \ - clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define test_and_set_bit_le(nr, addr) \ - test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) -# define test_and_clear_bit_le(nr, addr) \ - test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __set_bit_le(nr, addr) \ + __set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr) +# define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr) #else -# define test_bit_le test_bit -# define set_bit_le set_bit -# define clear_bit_le clear_bit -# define test_and_set_bit_le test_and_set_bit -# define test_and_clear_bit_le test_and_clear_bit +# define test_bit_le test_bit +# define __set_bit_le __set_bit +# define __clear_bit_le __clear_bit +# define 
__test_and_set_bit_le __test_and_set_bit +# define __test_and_clear_bit_le __test_and_clear_bit #endif +#define TEST_DIRTY_RING_COUNT 65536 + +#define SIG_IPI SIGUSR1 + /* * Guest/Host shared variables. Ensure addr_gva2hva() and/or * sync_global_to/from_guest() are used when accessing from @@ -66,7 +74,6 @@ static uint64_t host_page_size; static uint64_t guest_page_size; static uint64_t guest_num_pages; -static uint64_t random_array[TEST_PAGES_PER_LOOP]; static uint64_t iteration; /* @@ -99,19 +106,19 @@ static void guest_code(void) */ for (i = 0; i < guest_num_pages; i++) { addr = guest_test_virt_mem + i * guest_page_size; - *(uint64_t *)addr = READ_ONCE(iteration); + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } while (true) { for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { addr = guest_test_virt_mem; - addr += (READ_ONCE(random_array[i]) % guest_num_pages) + addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; - addr &= ~(host_page_size - 1); - *(uint64_t *)addr = READ_ONCE(iteration); + addr = align_down(addr, host_page_size); + + vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); } - /* Tell the host that we need more random numbers */ GUEST_SYNC(1); } } @@ -128,6 +135,320 @@ static uint64_t host_dirty_count; static uint64_t host_clear_count; static uint64_t host_track_next_count; +/* Whether dirty ring reset is requested, or finished */ +static sem_t sem_vcpu_stop; +static sem_t sem_vcpu_cont; +/* + * This is only set by main thread, and only cleared by vcpu thread. It is + * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC + * is the only place that we'll guarantee both "dirty bit" and "dirty data" + * will match. E.g., SIG_IPI won't guarantee that if the vcpu is interrupted + * after setting dirty bit but before the data is written. + */ +static atomic_t vcpu_sync_stop_requested; +/* + * This is updated by the vcpu thread to tell the host whether it's a + * ring-full event. It should only be read until a sem_wait() of + * sem_vcpu_stop and before vcpu continues to run. + */ +static bool dirty_ring_vcpu_ring_full; +/* + * This is only used for verifying the dirty pages. Dirty ring has a very + * tricky case when the ring just got full, kvm will do userspace exit due to + * ring full. When that happens, the very last PFN is set but actually the + * data is not changed (the guest WRITE is not really applied yet), because + * we found that the dirty ring is full, refused to continue the vcpu, and + * recorded the dirty gfn with the old contents. + * + * For this specific case, it's safe to skip checking this pfn for this + * bit, because it's a redundant bit, and when the write happens later the bit + * will be set again. We use this variable to always keep track of the latest + * dirty gfn we've collected, so that if a mismatch of data found later in the + * verifying process, we let it pass. + */ +static uint64_t dirty_ring_last_page; + +enum log_mode_t { + /* Only use KVM_GET_DIRTY_LOG for logging */ + LOG_MODE_DIRTY_LOG = 0, + + /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */ + LOG_MODE_CLEAR_LOG = 1, + + /* Use dirty ring for logging */ + LOG_MODE_DIRTY_RING = 2, + + LOG_MODE_NUM, + + /* Run all supported modes */ + LOG_MODE_ALL = LOG_MODE_NUM, +}; + +/* Mode of logging to test. 
Default is to run all supported modes */ +static enum log_mode_t host_log_mode_option = LOG_MODE_ALL; +/* Logging mode for current run */ +static enum log_mode_t host_log_mode; +static pthread_t vcpu_thread; +static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT; + +static void vcpu_kick(void) +{ + pthread_kill(vcpu_thread, SIG_IPI); +} + +/* + * In our test we do signal tricks, let's use a better version of + * sem_wait to avoid signal interrupts + */ +static void sem_wait_until(sem_t *sem) +{ + int ret; + + do + ret = sem_wait(sem); + while (ret == -1 && errno == EINTR); +} + +static bool clear_log_supported(void) +{ + return kvm_has_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); +} + +static void clear_log_create_vm_done(struct kvm_vm *vm) +{ + u64 manual_caps; + + manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!"); + manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, manual_caps); +} + +static void dirty_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, + void *bitmap, uint32_t num_pages, + uint32_t *unused) +{ + kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); +} + +static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, + void *bitmap, uint32_t num_pages, + uint32_t *unused) +{ + kvm_vm_get_dirty_log(vcpu->vm, slot, bitmap); + kvm_vm_clear_dirty_log(vcpu->vm, slot, bitmap, 0, num_pages); +} + +/* Should only be called after a GUEST_SYNC */ +static void vcpu_handle_sync_stop(void) +{ + if (atomic_read(&vcpu_sync_stop_requested)) { + /* It means main thread is sleeping waiting */ + atomic_set(&vcpu_sync_stop_requested, false); + sem_post(&sem_vcpu_stop); + sem_wait_until(&sem_vcpu_cont); + } +} + +static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +{ + struct kvm_run *run = vcpu->run; + + TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR), + "vcpu run failed: errno=%d", err); + + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s", + exit_reason_str(run->exit_reason)); + + vcpu_handle_sync_stop(); +} + +static bool dirty_ring_supported(void) +{ + return (kvm_has_cap(KVM_CAP_DIRTY_LOG_RING) || + kvm_has_cap(KVM_CAP_DIRTY_LOG_RING_ACQ_REL)); +} + +static void dirty_ring_create_vm_done(struct kvm_vm *vm) +{ + uint64_t pages; + uint32_t limit; + + /* + * We rely on vcpu exit due to full dirty ring state. Adjust + * the ring buffer size to ensure we're able to reach the + * full dirty ring state. + */ + pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3; + pages = vm_adjust_num_guest_pages(vm->mode, pages); + if (vm->page_size < getpagesize()) + pages = vm_num_host_pages(vm->mode, pages); + + limit = 1 << (31 - __builtin_clz(pages)); + test_dirty_ring_count = 1 << (31 - __builtin_clz(test_dirty_ring_count)); + test_dirty_ring_count = min(limit, test_dirty_ring_count); + pr_info("dirty ring count: 0x%x\n", test_dirty_ring_count); + + /* + * Switch to dirty ring mode after VM creation but before any + * of the vcpu creation. 
+ */ + vm_enable_dirty_ring(vm, test_dirty_ring_count * + sizeof(struct kvm_dirty_gfn)); +} + +static inline bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) +{ + return smp_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY; +} + +static inline void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) +{ + smp_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET); +} + +static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, + int slot, void *bitmap, + uint32_t num_pages, uint32_t *fetch_index) +{ + struct kvm_dirty_gfn *cur; + uint32_t count = 0; + + while (true) { + cur = &dirty_gfns[*fetch_index % test_dirty_ring_count]; + if (!dirty_gfn_is_dirtied(cur)) + break; + TEST_ASSERT(cur->slot == slot, "Slot number didn't match: " + "%u != %u", cur->slot, slot); + TEST_ASSERT(cur->offset < num_pages, "Offset overflow: " + "0x%llx >= 0x%x", cur->offset, num_pages); + //pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset); + __set_bit_le(cur->offset, bitmap); + dirty_ring_last_page = cur->offset; + dirty_gfn_set_collected(cur); + (*fetch_index)++; + count++; + } + + return count; +} + +static void dirty_ring_wait_vcpu(void) +{ + /* This makes sure that hardware PML cache flushed */ + vcpu_kick(); + sem_wait_until(&sem_vcpu_stop); +} + +static void dirty_ring_continue_vcpu(void) +{ + pr_info("Notifying vcpu to continue\n"); + sem_post(&sem_vcpu_cont); +} + +static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx) +{ + uint32_t count = 0, cleared; + bool continued_vcpu = false; + + dirty_ring_wait_vcpu(); + + if (!dirty_ring_vcpu_ring_full) { + /* + * This is not a ring-full event, it's safe to allow + * vcpu to continue + */ + dirty_ring_continue_vcpu(); + continued_vcpu = true; + } + + /* Only have one vcpu */ + count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu), + slot, bitmap, num_pages, + ring_buf_idx); + + cleared = kvm_vm_reset_dirty_ring(vcpu->vm); + + /* + * Cleared pages should be the same as collected, as KVM is supposed to + * clear only the entries that have been harvested. + */ + TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch " + "with collected (%u)", cleared, count); + + if (!continued_vcpu) { + TEST_ASSERT(dirty_ring_vcpu_ring_full, + "Didn't continue vcpu even without ring full"); + dirty_ring_continue_vcpu(); + } + + pr_info("Iteration %ld collected %u pages\n", iteration, count); +} + +static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +{ + struct kvm_run *run = vcpu->run; + + /* A ucall-sync or ring-full event is allowed */ + if (get_ucall(vcpu, NULL) == UCALL_SYNC) { + /* We should allow this to continue */ + ; + } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL || + (ret == -1 && err == EINTR)) { + /* Update the flag first before pause */ + WRITE_ONCE(dirty_ring_vcpu_ring_full, + run->exit_reason == KVM_EXIT_DIRTY_RING_FULL); + sem_post(&sem_vcpu_stop); + pr_info("vcpu stops because %s...\n", + dirty_ring_vcpu_ring_full ? 
+ "dirty ring is full" : "vcpu is kicked out"); + sem_wait_until(&sem_vcpu_cont); + pr_info("vcpu continues now.\n"); + } else { + TEST_ASSERT(false, "Invalid guest sync status: " + "exit_reason=%s", + exit_reason_str(run->exit_reason)); + } +} + +struct log_mode { + const char *name; + /* Return true if this mode is supported, otherwise false */ + bool (*supported)(void); + /* Hook when the vm creation is done (before vcpu creation) */ + void (*create_vm_done)(struct kvm_vm *vm); + /* Hook to collect the dirty pages into the bitmap provided */ + void (*collect_dirty_pages) (struct kvm_vcpu *vcpu, int slot, + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx); + /* Hook to call when after each vcpu run */ + void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err); +} log_modes[LOG_MODE_NUM] = { + { + .name = "dirty-log", + .collect_dirty_pages = dirty_log_collect_dirty_pages, + .after_vcpu_run = default_after_vcpu_run, + }, + { + .name = "clear-log", + .supported = clear_log_supported, + .create_vm_done = clear_log_create_vm_done, + .collect_dirty_pages = clear_log_collect_dirty_pages, + .after_vcpu_run = default_after_vcpu_run, + }, + { + .name = "dirty-ring", + .supported = dirty_ring_supported, + .create_vm_done = dirty_ring_create_vm_done, + .collect_dirty_pages = dirty_ring_collect_dirty_pages, + .after_vcpu_run = dirty_ring_after_vcpu_run, + }, +}; + /* * We use this bitmap to track some pages that should have its dirty * bit set in the _next_ iteration. For example, if we detected the @@ -137,39 +458,86 @@ static uint64_t host_track_next_count; */ static unsigned long *host_bmap_track; -static void generate_random_array(uint64_t *guest_array, uint64_t size) +static void log_modes_dump(void) { - uint64_t i; + int i; - for (i = 0; i < size; i++) - guest_array[i] = random(); + printf("all"); + for (i = 0; i < LOG_MODE_NUM; i++) + printf(", %s", log_modes[i].name); + printf("\n"); +} + +static bool log_mode_supported(void) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->supported) + return mode->supported(); + + return true; +} + +static void log_mode_create_vm_done(struct kvm_vm *vm) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->create_vm_done) + mode->create_vm_done(vm); +} + +static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, + void *bitmap, uint32_t num_pages, + uint32_t *ring_buf_idx) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + TEST_ASSERT(mode->collect_dirty_pages != NULL, + "collect_dirty_pages() is required for any log mode!"); + mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx); +} + +static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +{ + struct log_mode *mode = &log_modes[host_log_mode]; + + if (mode->after_vcpu_run) + mode->after_vcpu_run(vcpu, ret, err); } static void *vcpu_worker(void *data) { int ret; - struct kvm_vm *vm = data; - uint64_t *guest_array; + struct kvm_vcpu *vcpu = data; uint64_t pages_count = 0; - struct kvm_run *run; + struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) + + sizeof(sigset_t)); + sigset_t *sigset = (sigset_t *) &sigmask->sigset; - run = vcpu_state(vm, VCPU_ID); + /* + * SIG_IPI is unblocked atomically while in KVM_RUN. It causes the + * ioctl to return with -EINTR, but it is still pending and we need + * to accept it with the sigwait. 
+ */ + sigmask->len = 8; + pthread_sigmask(0, NULL, sigset); + sigdelset(sigset, SIG_IPI); + vcpu_ioctl(vcpu, KVM_SET_SIGNAL_MASK, sigmask); - guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array); - generate_random_array(guest_array, TEST_PAGES_PER_LOOP); + sigemptyset(sigset); + sigaddset(sigset, SIG_IPI); while (!READ_ONCE(host_quit)) { + /* Clear any existing kick signals */ + pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ - ret = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret); - if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC) { - pages_count += TEST_PAGES_PER_LOOP; - generate_random_array(guest_array, TEST_PAGES_PER_LOOP); - } else { - TEST_FAIL("Invalid guest sync status: " - "exit_reason=%s\n", - exit_reason_str(run->exit_reason)); + ret = __vcpu_run(vcpu); + if (ret == -1 && errno == EINTR) { + int sig = -1; + sigwait(sigset, &sig); + assert(sig == SIG_IPI); } + log_mode_after_vcpu_run(vcpu, ret, errno); } pr_info("Dirtied %"PRIu64" pages\n", pages_count); @@ -182,12 +550,13 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) uint64_t step = vm_num_host_pages(mode, 1); uint64_t page; uint64_t *value_ptr; + uint64_t min_iter = 0; for (page = 0; page < host_num_pages; page += step) { value_ptr = host_test_mem + page * host_page_size; /* If this is a special page that we were tracking... */ - if (test_and_clear_bit_le(page, host_bmap_track)) { + if (__test_and_clear_bit_le(page, host_bmap_track)) { host_track_next_count++; TEST_ASSERT(test_bit_le(page, bmap), "Page %"PRIu64" should have its dirty bit " @@ -195,15 +564,65 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) page); } - if (test_bit_le(page, bmap)) { + if (__test_and_clear_bit_le(page, bmap)) { + bool matched; + host_dirty_count++; + /* * If the bit is set, the value written onto * the corresponding page should be either the * previous iteration number or the current one. */ - TEST_ASSERT(*value_ptr == iteration || - *value_ptr == iteration - 1, + matched = (*value_ptr == iteration || + *value_ptr == iteration - 1); + + if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) { + if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) { + /* + * Short answer: this case is special + * only for dirty ring test where the + * page is the last page before a kvm + * dirty ring full in iteration N-2. + * + * Long answer: Assuming ring size R, + * one possible condition is: + * + * main thr vcpu thr + * -------- -------- + * iter=1 + * write 1 to page 0~(R-1) + * full, vmexit + * collect 0~(R-1) + * kick vcpu + * write 1 to (R-1)~(2R-2) + * full, vmexit + * iter=2 + * collect (R-1)~(2R-2) + * kick vcpu + * write 1 to (2R-2) + * (NOTE!!! "1" cached in cpu reg) + * write 2 to (2R-1)~(3R-3) + * full, vmexit + * iter=3 + * collect (2R-2)~(3R-3) + * (here if we read value on page + * "2R-2" is 1, while iter=3!!!) + * + * This however can only happen once per iteration. + */ + min_iter = iteration - 1; + continue; + } else if (page == dirty_ring_last_page) { + /* + * Please refer to comments in + * dirty_ring_last_page. 
+ */ + continue; + } + } + + TEST_ASSERT(matched, "Set page %"PRIu64" value %"PRIu64 " incorrect (iteration=%"PRIu64")", page, *value_ptr, iteration); @@ -238,42 +657,46 @@ static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) * should report its dirtyness in the * next run */ - set_bit_le(page, host_bmap_track); + __set_bit_le(page, host_bmap_track); } } } } -static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, +static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, uint64_t extra_mem_pages, void *guest_code) { struct kvm_vm *vm; - uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif - vm_vcpu_add_default(vm, vcpuid, guest_code); + vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages); + + log_mode_create_vm_done(vm); + *vcpu = vm_vcpu_add(vm, 0, guest_code); return vm; } -#define DIRTY_MEM_BITS 30 /* 1G */ -#define PAGE_SHIFT_4K 12 - -#ifdef USE_CLEAR_DIRTY_LOG -static u64 dirty_log_manual_caps; -#endif +struct test_params { + unsigned long iterations; + unsigned long interval; + uint64_t phys_offset; +}; -static void run_test(enum vm_guest_mode mode, unsigned long iterations, - unsigned long interval, uint64_t phys_offset) +static void run_test(enum vm_guest_mode mode, void *arg) { - pthread_t vcpu_thread; + struct test_params *p = arg; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; unsigned long *bmap; + uint32_t ring_buf_idx = 0; + int sem_val; + + if (!log_mode_supported()) { + print_skip("Log mode '%s' not supported", + log_modes[host_log_mode].name); + return; + } /* * We reserve page table for 2 times of extra dirty mem which @@ -283,47 +706,37 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, * (e.g., 64K page size guest will need even less memory for * page tables). */ - vm = create_vm(mode, VCPU_ID, - 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), - guest_code); + vm = create_vm(mode, &vcpu, + 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K), guest_code); - guest_page_size = vm_get_page_size(vm); + guest_page_size = vm->page_size; /* * A little more than 1G of guest page sized pages. Cover the * case where the size is not aligned to 64 pages. 
*/ - guest_num_pages = (1ul << (DIRTY_MEM_BITS - - vm_get_page_shift(vm))) + 3; + guest_num_pages = (1ul << (DIRTY_MEM_BITS - vm->page_shift)) + 3; guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); host_page_size = getpagesize(); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - if (!phys_offset) { - guest_test_phys_mem = (vm_get_max_gfn(vm) - - guest_num_pages) * guest_page_size; - guest_test_phys_mem &= ~(host_page_size - 1); + if (!p->phys_offset) { + guest_test_phys_mem = (vm->max_gfn - guest_num_pages) * + guest_page_size; + guest_test_phys_mem = align_down(guest_test_phys_mem, host_page_size); } else { - guest_test_phys_mem = phys_offset; + guest_test_phys_mem = p->phys_offset; } #ifdef __s390x__ /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); + guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - bmap = bitmap_alloc(host_num_pages); - host_bmap_track = bitmap_alloc(host_num_pages); - -#ifdef USE_CLEAR_DIRTY_LOG - struct kvm_enable_cap cap = {}; - - cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; - cap.args[0] = dirty_log_manual_caps; - vm_enable_cap(vm, &cap); -#endif + bmap = bitmap_zalloc(host_num_pages); + host_bmap_track = bitmap_zalloc(host_num_pages); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -333,16 +746,11 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, KVM_MEM_LOG_DIRTY_PAGES); /* Do mapping for the dirty track memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); /* Cache the HVA pointer of the region */ host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); -#ifdef __x86_64__ - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); -#endif - ucall_init(vm, NULL); - /* Export the shared variables to the guest */ sync_global_to_guest(vm, host_page_size); sync_global_to_guest(vm, guest_page_size); @@ -352,28 +760,62 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, /* Start the iterations */ iteration = 1; sync_global_to_guest(vm, iteration); - host_quit = false; + WRITE_ONCE(host_quit, false); host_dirty_count = 0; host_clear_count = 0; host_track_next_count = 0; + WRITE_ONCE(dirty_ring_vcpu_ring_full, false); - pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); + /* + * Ensure the previous iteration didn't leave a dangling semaphore, i.e. + * that the main task and vCPU worker were synchronized and completed + * verification of all iterations. + */ + sem_getvalue(&sem_vcpu_stop, &sem_val); + TEST_ASSERT_EQ(sem_val, 0); + sem_getvalue(&sem_vcpu_cont, &sem_val); + TEST_ASSERT_EQ(sem_val, 0); + + pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); - while (iteration < iterations) { + while (iteration < p->iterations) { /* Give the vcpu thread some time to dirty some pages */ - usleep(interval * 1000); - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); -#ifdef USE_CLEAR_DIRTY_LOG - kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); -#endif + usleep(p->interval * 1000); + log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, + bmap, host_num_pages, + &ring_buf_idx); + + /* + * See vcpu_sync_stop_requested definition for details on why + * we need to stop vcpu when verify data. 
+ */ + atomic_set(&vcpu_sync_stop_requested, true); + sem_wait_until(&sem_vcpu_stop); + /* + * NOTE: for dirty ring, it's possible that we didn't stop at + * GUEST_SYNC but instead we stopped because ring is full; + * that's okay too because ring full means we're only missing + * the flush of the last page, and since we handle the last + * page specially verification will succeed anyway. + */ + assert(host_log_mode == LOG_MODE_DIRTY_RING || + atomic_read(&vcpu_sync_stop_requested) == false); vm_dirty_log_verify(mode, bmap); - iteration++; + + /* + * Set host_quit before sem_vcpu_cont in the final iteration to + * ensure that the vCPU worker doesn't resume the guest. As + * above, the dirty ring test may stop and wait even when not + * explicitly request to do so, i.e. would hang waiting for a + * "continue" if it's allowed to resume the guest. + */ + if (++iteration == p->iterations) + WRITE_ONCE(host_quit, true); + + sem_post(&sem_vcpu_cont); sync_global_to_guest(vm, iteration); } - /* Tell the vcpu thread to quit */ - host_quit = true; pthread_join(vcpu_thread, NULL); pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " @@ -382,109 +824,82 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, free(bmap); free(host_bmap_track); - ucall_uninit(vm); kvm_vm_free(vm); } -struct guest_mode { - bool supported; - bool enabled; -}; -static struct guest_mode guest_modes[NUM_VM_MODES]; - -#define guest_mode_init(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ -}) - static void help(char *name) { - int i; - puts(""); printf("usage: %s [-h] [-i iterations] [-I interval] " "[-p offset] [-m mode]\n", name); puts(""); + printf(" -c: hint to dirty ring size, in number of entries\n"); + printf(" (only useful for dirty-ring test; default: %"PRIu32")\n", + TEST_DIRTY_RING_COUNT); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n", TEST_HOST_LOOP_INTERVAL); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); - printf(" -m: specify the guest mode ID to test " - "(default: test all supported modes)\n" - " This option may be used multiple times.\n" - " Guest mode IDs:\n"); - for (i = 0; i < NUM_VM_MODES; ++i) { - printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - guest_modes[i].supported ? " (supported)" : ""); - } + printf(" -M: specify the host logging mode " + "(default: run all log modes). 
Supported modes: \n\t"); + log_modes_dump(); + guest_modes_help(); puts(""); exit(0); } int main(int argc, char *argv[]) { - unsigned long iterations = TEST_HOST_LOOP_N; - unsigned long interval = TEST_HOST_LOOP_INTERVAL; - bool mode_selected = false; - uint64_t phys_offset = 0; - unsigned int mode; + struct test_params p = { + .iterations = TEST_HOST_LOOP_N, + .interval = TEST_HOST_LOOP_INTERVAL, + }; int opt, i; + sigset_t sigset; -#ifdef USE_CLEAR_DIRTY_LOG - dirty_log_manual_caps = - kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); - if (!dirty_log_manual_caps) { - print_skip("KVM_CLEAR_DIRTY_LOG not available"); - exit(KSFT_SKIP); - } - dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | - KVM_DIRTY_LOG_INITIALLY_SET); -#endif - -#ifdef __x86_64__ - guest_mode_init(VM_MODE_PXXV48_4K, true, true); -#endif -#ifdef __aarch64__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); - guest_mode_init(VM_MODE_P40V48_64K, true, true); - - { - unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + sem_init(&sem_vcpu_stop, 0, 0); + sem_init(&sem_vcpu_cont, 0, 0); - if (limit >= 52) - guest_mode_init(VM_MODE_P52V48_64K, true, true); - if (limit >= 48) { - guest_mode_init(VM_MODE_P48V48_4K, true, true); - guest_mode_init(VM_MODE_P48V48_64K, true, true); - } - } -#endif -#ifdef __s390x__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); -#endif + guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:I:p:m:")) != -1) { + while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) { switch (opt) { + case 'c': + test_dirty_ring_count = strtol(optarg, NULL, 10); + break; case 'i': - iterations = strtol(optarg, NULL, 10); + p.iterations = strtol(optarg, NULL, 10); break; case 'I': - interval = strtol(optarg, NULL, 10); + p.interval = strtol(optarg, NULL, 10); break; case 'p': - phys_offset = strtoull(optarg, NULL, 0); + p.phys_offset = strtoull(optarg, NULL, 0); break; case 'm': - if (!mode_selected) { - for (i = 0; i < NUM_VM_MODES; ++i) - guest_modes[i].enabled = false; - mode_selected = true; + guest_modes_cmdline(optarg); + break; + case 'M': + if (!strcmp(optarg, "all")) { + host_log_mode_option = LOG_MODE_ALL; + break; + } + for (i = 0; i < LOG_MODE_NUM; i++) { + if (!strcmp(optarg, log_modes[i].name)) { + pr_info("Setting log mode to: '%s'\n", + optarg); + host_log_mode_option = i; + break; + } + } + if (i == LOG_MODE_NUM) { + printf("Log mode '%s' invalid. Please choose " + "from: ", optarg); + log_modes_dump(); + exit(1); } - mode = strtoul(optarg, NULL, 10); - TEST_ASSERT(mode < NUM_VM_MODES, - "Guest mode ID %d too big", mode); - guest_modes[mode].enabled = true; break; case 'h': default: @@ -493,21 +908,29 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); - TEST_ASSERT(interval > 0, "Interval must be greater than zero"); + TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(p.interval > 0, "Interval must be greater than zero"); pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", - iterations, interval); + p.iterations, p.interval); srandom(time(0)); - for (i = 0; i < NUM_VM_MODES; ++i) { - if (!guest_modes[i].enabled) - continue; - TEST_ASSERT(guest_modes[i].supported, - "Guest mode ID %d (%s) not supported.", - i, vm_guest_mode_string(i)); - run_test(i, iterations, interval, phys_offset); + /* Ensure that vCPU threads start with SIG_IPI blocked. 
*/ + sigemptyset(&sigset); + sigaddset(&sigset, SIG_IPI); + pthread_sigmask(SIG_BLOCK, &sigset, NULL); + + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (i = 0; i < LOG_MODE_NUM; i++) { + pr_info("Testing Log Mode '%s'\n", log_modes[i].name); + host_log_mode = i; + for_each_guest_mode(run_test, &p); + } + } else { + host_log_mode = host_log_mode_option; + for_each_guest_mode(run_test, &p); } return 0; diff --git a/tools/testing/selftests/kvm/get-reg-list.c b/tools/testing/selftests/kvm/get-reg-list.c new file mode 100644 index 000000000000..91f05f78e824 --- /dev/null +++ b/tools/testing/selftests/kvm/get-reg-list.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (C) 2020, Red Hat, Inc. + * + * When attempting to migrate from a host with an older kernel to a host + * with a newer kernel we allow the newer kernel on the destination to + * list new registers with get-reg-list. We assume they'll be unused, at + * least until the guest reboots, and so they're relatively harmless. + * However, if the destination host with the newer kernel is missing + * registers which the source host with the older kernel has, then that's + * a regression in get-reg-list. This test checks for that regression by + * checking the current list against a blessed list. We should never have + * missing registers, but if new ones appear then they can probably be + * added to the blessed list. A completely new blessed list can be created + * by running the test with the --list command line argument. + * + * The blessed list should be created from the oldest possible kernel. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +static struct kvm_reg_list *reg_list; +static __u64 *blessed_reg, blessed_n; + +extern struct vcpu_reg_list *vcpu_configs[]; +extern int vcpu_configs_n; + +#define for_each_reg(i) \ + for ((i) = 0; (i) < reg_list->n; ++(i)) + +#define for_each_reg_filtered(i) \ + for_each_reg(i) \ + if (!filter_reg(reg_list->reg[i])) + +#define for_each_missing_reg(i) \ + for ((i) = 0; (i) < blessed_n; ++(i)) \ + if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i])) \ + if (check_supported_reg(vcpu, blessed_reg[i])) + +#define for_each_new_reg(i) \ + for_each_reg_filtered(i) \ + if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i])) + +#define for_each_present_blessed_reg(i) \ + for_each_reg(i) \ + if (find_reg(blessed_reg, blessed_n, reg_list->reg[i])) + +static const char *config_name(struct vcpu_reg_list *c) +{ + struct vcpu_reg_sublist *s; + int len = 0; + + if (c->name) + return c->name; + + for_each_sublist(c, s) + len += strlen(s->name) + 1; + + c->name = malloc(len); + + len = 0; + for_each_sublist(c, s) { + if (!strcmp(s->name, "base")) + continue; + if (len) + c->name[len++] = '+'; + strcpy(c->name + len, s->name); + len += strlen(s->name); + } + c->name[len] = '\0'; + + return c->name; +} + +bool __weak check_supported_reg(struct kvm_vcpu *vcpu, __u64 reg) +{ + return true; +} + +bool __weak filter_reg(__u64 reg) +{ + return false; +} + +static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg) +{ + int i; + + for (i = 0; i < nr_regs; ++i) + if (reg == regs[i]) + return true; + return false; +} + +void __weak print_reg(const char *prefix, __u64 id) +{ + printf("\t0x%llx,\n", id); +} + +bool __weak check_reject_set(int err) +{ + return 
true; +} + +void __weak finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) +{ +} + +#ifdef __aarch64__ +static void prepare_vcpu_init(struct vcpu_reg_list *c, struct kvm_vcpu_init *init) +{ + struct vcpu_reg_sublist *s; + + for_each_sublist(c, s) + if (s->capability) + init->features[s->feature / 32] |= 1 << (s->feature % 32); +} + +static struct kvm_vcpu *vcpu_config_get_vcpu(struct vcpu_reg_list *c, struct kvm_vm *vm) +{ + struct kvm_vcpu_init init = { .target = -1, }; + struct kvm_vcpu *vcpu; + + prepare_vcpu_init(c, &init); + vcpu = __vm_vcpu_add(vm, 0); + aarch64_vcpu_setup(vcpu, &init); + + return vcpu; +} +#else +static struct kvm_vcpu *vcpu_config_get_vcpu(struct vcpu_reg_list *c, struct kvm_vm *vm) +{ + return __vm_vcpu_add(vm, 0); +} +#endif + +static void check_supported(struct vcpu_reg_list *c) +{ + struct vcpu_reg_sublist *s; + + for_each_sublist(c, s) { + if (!s->capability) + continue; + + __TEST_REQUIRE(kvm_has_cap(s->capability), + "%s: %s not available, skipping tests", + config_name(c), s->name); + } +} + +static bool print_list; +static bool print_filtered; + +static void run_test(struct vcpu_reg_list *c) +{ + int new_regs = 0, missing_regs = 0, i, n; + int failed_get = 0, failed_set = 0, failed_reject = 0; + int skipped_set = 0; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct vcpu_reg_sublist *s; + + check_supported(c); + + vm = vm_create_barebones(); + vcpu = vcpu_config_get_vcpu(c, vm); + finalize_vcpu(vcpu, c); + + reg_list = vcpu_get_reg_list(vcpu); + + if (print_list || print_filtered) { + putchar('\n'); + for_each_reg(i) { + __u64 id = reg_list->reg[i]; + if ((print_list && !filter_reg(id)) || + (print_filtered && filter_reg(id))) + print_reg(config_name(c), id); + } + putchar('\n'); + return; + } + + for_each_sublist(c, s) + blessed_n += s->regs_n; + blessed_reg = calloc(blessed_n, sizeof(__u64)); + + n = 0; + for_each_sublist(c, s) { + for (i = 0; i < s->regs_n; ++i) + blessed_reg[n++] = s->regs[i]; + } + + /* + * We only test that we can get the register and then write back the + * same value. Some registers may allow other values to be written + * back, but others only allow some bits to be changed, and at least + * for ID registers set will fail if the value does not exactly match + * what was returned by get. If registers that allow other values to + * be written need to have the other values tested, then we should + * create a new set of tests for those in a new independent test + * executable. + * + * Only do the get/set tests on present, blessed list registers, + * since we don't know the capabilities of any new registers. 
+ */ + for_each_present_blessed_reg(i) { + uint8_t addr[2048 / 8]; + struct kvm_one_reg reg = { + .id = reg_list->reg[i], + .addr = (__u64)&addr, + }; + bool reject_reg = false, skip_reg = false; + int ret; + + ret = __vcpu_get_reg(vcpu, reg_list->reg[i], &addr); + if (ret) { + printf("%s: Failed to get ", config_name(c)); + print_reg(config_name(c), reg.id); + putchar('\n'); + ++failed_get; + } + + for_each_sublist(c, s) { + /* rejects_set registers are rejected for set operation */ + if (s->rejects_set && find_reg(s->rejects_set, s->rejects_set_n, reg.id)) { + reject_reg = true; + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); + if (ret != -1 || !check_reject_set(errno)) { + printf("%s: Failed to reject (ret=%d, errno=%d) ", config_name(c), ret, errno); + print_reg(config_name(c), reg.id); + putchar('\n'); + ++failed_reject; + } + break; + } + + /* skips_set registers are skipped for set operation */ + if (s->skips_set && find_reg(s->skips_set, s->skips_set_n, reg.id)) { + skip_reg = true; + ++skipped_set; + break; + } + } + + if (!reject_reg && !skip_reg) { + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); + if (ret) { + printf("%s: Failed to set ", config_name(c)); + print_reg(config_name(c), reg.id); + putchar('\n'); + ++failed_set; + } + } + } + + for_each_new_reg(i) + ++new_regs; + + for_each_missing_reg(i) + ++missing_regs; + + if (new_regs || missing_regs) { + n = 0; + for_each_reg_filtered(i) + ++n; + + printf("%s: Number blessed registers: %5lld\n", config_name(c), blessed_n); + printf("%s: Number registers: %5lld (includes %lld filtered registers)\n", + config_name(c), reg_list->n, reg_list->n - n); + } + + if (new_regs) { + printf("\n%s: There are %d new registers.\n" + "Consider adding them to the blessed reg " + "list with the following lines:\n\n", config_name(c), new_regs); + for_each_new_reg(i) + print_reg(config_name(c), reg_list->reg[i]); + putchar('\n'); + } + + if (missing_regs) { + printf("\n%s: There are %d missing registers.\n" + "The following lines are missing registers:\n\n", config_name(c), missing_regs); + for_each_missing_reg(i) + print_reg(config_name(c), blessed_reg[i]); + putchar('\n'); + } + + TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject, + "%s: There are %d missing registers; %d registers failed get; " + "%d registers failed set; %d registers failed reject; %d registers skipped set", + config_name(c), missing_regs, failed_get, failed_set, failed_reject, skipped_set); + + pr_info("%s: PASS\n", config_name(c)); + blessed_n = 0; + free(blessed_reg); + free(reg_list); + kvm_vm_free(vm); +} + +static void help(void) +{ + struct vcpu_reg_list *c; + int i; + + printf( + "\n" + "usage: get-reg-list [--config=<selection>] [--list] [--list-filtered]\n\n" + " --config=<selection> Used to select a specific vcpu configuration for the test/listing\n" + " '<selection>' may be\n"); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + printf( + " '%s'\n", config_name(c)); + } + + printf( + "\n" + " --list Print the register list rather than test it (requires --config)\n" + " --list-filtered Print registers that would normally be filtered out (requires --config)\n" + "\n" + ); +} + +static struct vcpu_reg_list *parse_config(const char *config) +{ + struct vcpu_reg_list *c = NULL; + int i; + + if (config[8] != '=') + help(), exit(1); + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (strcmp(config_name(c), &config[9]) == 0) + break; + } + + if (i == vcpu_configs_n) + help(), exit(1); + + return c; +} + +int 
main(int ac, char **av) +{ + struct vcpu_reg_list *c, *sel = NULL; + int i, ret = 0; + pid_t pid; + + for (i = 1; i < ac; ++i) { + if (strncmp(av[i], "--config", 8) == 0) + sel = parse_config(av[i]); + else if (strcmp(av[i], "--list") == 0) + print_list = true; + else if (strcmp(av[i], "--list-filtered") == 0) + print_filtered = true; + else if (strcmp(av[i], "--help") == 0 || strcmp(av[1], "-h") == 0) + help(), exit(0); + else + help(), exit(1); + } + + if (print_list || print_filtered) { + /* + * We only want to print the register list of a single config. + */ + if (!sel) + help(), exit(1); + } + + for (i = 0; i < vcpu_configs_n; ++i) { + c = vcpu_configs[i]; + if (sel && c != sel) + continue; + + pid = fork(); + + if (!pid) { + run_test(c); + exit(0); + } else { + int wstatus; + pid_t wpid = wait(&wstatus); + TEST_ASSERT(wpid == pid && WIFEXITED(wstatus), "wait: Unexpected return"); + if (WEXITSTATUS(wstatus) && WEXITSTATUS(wstatus) != KSFT_SKIP) + ret = KSFT_FAIL; + } + } + + return ret; +} diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c new file mode 100644 index 000000000000..ba0c8e996035 --- /dev/null +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright Intel Corporation, 2023 + * + * Author: Chao Peng <chao.p.peng@linux.intel.com> + */ +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> + +#include <linux/bitmap.h> +#include <linux/falloc.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "kvm_util.h" +#include "test_util.h" + +static void test_file_read_write(int fd) +{ + char buf[64]; + + TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, + "read on a guest_mem fd should fail"); + TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, + "write on a guest_mem fd should fail"); + TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, + "pread on a guest_mem fd should fail"); + TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, + "pwrite on a guest_mem fd should fail"); +} + +static void test_mmap(int fd, size_t page_size) +{ + char *mem; + + mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); +} + +static void test_file_size(int fd, size_t page_size, size_t total_size) +{ + struct stat sb; + int ret; + + ret = fstat(fd, &sb); + TEST_ASSERT(!ret, "fstat should succeed"); + TEST_ASSERT_EQ(sb.st_size, total_size); + TEST_ASSERT_EQ(sb.st_blksize, page_size); +} + +static void test_fallocate(int fd, size_t page_size, size_t total_size) +{ + int ret; + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, total_size); + TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size - 1, page_size); + TEST_ASSERT(ret, "fallocate with unaligned offset should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size, page_size); + TEST_ASSERT(ret, "fallocate beginning at total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size + page_size, page_size); + TEST_ASSERT(ret, "fallocate beginning after total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size + page_size, page_size); + TEST_ASSERT(!ret, 
"fallocate(PUNCH_HOLE) after total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size - 1); + TEST_ASSERT(ret, "fallocate with unaligned size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, page_size, page_size); + TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed"); +} + +static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) +{ + struct { + off_t offset; + off_t len; + } testcases[] = { + {0, 1}, + {0, page_size - 1}, + {0, page_size + 1}, + + {1, 1}, + {1, page_size - 1}, + {1, page_size}, + {1, page_size + 1}, + + {page_size, 1}, + {page_size, page_size - 1}, + {page_size, page_size + 1}, + }; + int ret, i; + + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + testcases[i].offset, testcases[i].len); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "PUNCH_HOLE with !PAGE_SIZE offset (%lx) and/or length (%lx) should fail", + testcases[i].offset, testcases[i].len); + } +} + +static void test_create_guest_memfd_invalid(struct kvm_vm *vm) +{ + size_t page_size = getpagesize(); + uint64_t flag; + size_t size; + int fd; + + for (size = 1; size < page_size; size++) { + fd = __vm_create_guest_memfd(vm, size, 0); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL", + size); + } + + for (flag = 0; flag; flag <<= 1) { + fd = __vm_create_guest_memfd(vm, page_size, flag); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with flag '0x%lx' should fail with EINVAL", + flag); + } +} + +static void test_create_guest_memfd_multiple(struct kvm_vm *vm) +{ + int fd1, fd2, ret; + struct stat st1, st2; + + fd1 = __vm_create_guest_memfd(vm, 4096, 0); + TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size"); + + fd2 = __vm_create_guest_memfd(vm, 8192, 0); + TEST_ASSERT(fd2 != -1, "memfd creation should succeed"); + + ret = fstat(fd2, &st2); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); + TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); + + close(fd2); + close(fd1); +} + +int main(int argc, char *argv[]) +{ + size_t page_size; + size_t total_size; + int fd; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + page_size = getpagesize(); + total_size = page_size * 4; + + vm = vm_create_barebones(); + + test_create_guest_memfd_invalid(vm); + test_create_guest_memfd_multiple(vm); + + fd = vm_create_guest_memfd(vm, total_size, 0); + + test_file_read_write(fd); + test_mmap(fd, page_size); + test_file_size(fd, page_size, total_size); + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + + close(fd); +} diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c new file mode 100644 index 
000000000000..8092c2d0f5d6 --- /dev/null +++ b/tools/testing/selftests/kvm/guest_print_test.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test for GUEST_PRINTF + * + * Copyright 2022, Google, Inc. and/or its affiliates. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "ucall_common.h" + +struct guest_vals { + uint64_t a; + uint64_t b; + uint64_t type; +}; + +static struct guest_vals vals; + +/* GUEST_PRINTF()/GUEST_ASSERT_FMT() does not support float or double. */ +#define TYPE_LIST \ +TYPE(test_type_i64, I64, "%ld", int64_t) \ +TYPE(test_type_u64, U64u, "%lu", uint64_t) \ +TYPE(test_type_x64, U64x, "0x%lx", uint64_t) \ +TYPE(test_type_X64, U64X, "0x%lX", uint64_t) \ +TYPE(test_type_u32, U32u, "%u", uint32_t) \ +TYPE(test_type_x32, U32x, "0x%x", uint32_t) \ +TYPE(test_type_X32, U32X, "0x%X", uint32_t) \ +TYPE(test_type_int, INT, "%d", int) \ +TYPE(test_type_char, CHAR, "%c", char) \ +TYPE(test_type_str, STR, "'%s'", const char *) \ +TYPE(test_type_ptr, PTR, "%p", uintptr_t) + +enum args_type { +#define TYPE(fn, ext, fmt_t, T) TYPE_##ext, + TYPE_LIST +#undef TYPE +}; + +static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, + const char *expected_assert); + +#define BUILD_TYPE_STRINGS_AND_HELPER(fn, ext, fmt_t, T) \ +const char *PRINTF_FMT_##ext = "Got params a = " fmt_t " and b = " fmt_t; \ +const char *ASSERT_FMT_##ext = "Expected " fmt_t ", got " fmt_t " instead"; \ +static void fn(struct kvm_vcpu *vcpu, T a, T b) \ +{ \ + char expected_printf[UCALL_BUFFER_LEN]; \ + char expected_assert[UCALL_BUFFER_LEN]; \ + \ + snprintf(expected_printf, UCALL_BUFFER_LEN, PRINTF_FMT_##ext, a, b); \ + snprintf(expected_assert, UCALL_BUFFER_LEN, ASSERT_FMT_##ext, a, b); \ + vals = (struct guest_vals){ (uint64_t)a, (uint64_t)b, TYPE_##ext }; \ + sync_global_to_guest(vcpu->vm, vals); \ + run_test(vcpu, expected_printf, expected_assert); \ +} + +#define TYPE(fn, ext, fmt_t, T) \ + BUILD_TYPE_STRINGS_AND_HELPER(fn, ext, fmt_t, T) + TYPE_LIST +#undef TYPE + +static void guest_code(void) +{ + while (1) { + switch (vals.type) { +#define TYPE(fn, ext, fmt_t, T) \ + case TYPE_##ext: \ + GUEST_PRINTF(PRINTF_FMT_##ext, vals.a, vals.b); \ + __GUEST_ASSERT(vals.a == vals.b, \ + ASSERT_FMT_##ext, vals.a, vals.b); \ + break; + TYPE_LIST +#undef TYPE + default: + GUEST_SYNC(vals.type); + } + + GUEST_DONE(); + } +} + +/* + * Unfortunately this gets a little messy because 'assert_msg' doesn't + * just contains the matching string, it also contains additional assert + * info. Fortunately the part that matches should be at the very end of + * 'assert_msg'. + */ +static void ucall_abort(const char *assert_msg, const char *expected_assert_msg) +{ + int len_str = strlen(assert_msg); + int len_substr = strlen(expected_assert_msg); + int offset = len_str - len_substr; + + TEST_ASSERT(len_substr <= len_str, + "Expected '%s' to be a substring of '%s'", + assert_msg, expected_assert_msg); + + TEST_ASSERT(strcmp(&assert_msg[offset], expected_assert_msg) == 0, + "Unexpected mismatch. 
Expected: '%s', got: '%s'", + expected_assert_msg, &assert_msg[offset]); +} + +static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf, + const char *expected_assert) +{ + struct kvm_run *run = vcpu->run; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, + "Unexpected exit reason: %u (%s),", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_FAIL("Unknown 'args_type' = %lu", uc.args[1]); + break; + case UCALL_PRINTF: + TEST_ASSERT(strcmp(uc.buffer, expected_printf) == 0, + "Unexpected mismatch. Expected: '%s', got: '%s'", + expected_printf, uc.buffer); + break; + case UCALL_ABORT: + ucall_abort(uc.buffer, expected_assert); + break; + case UCALL_DONE: + return; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +} + +static void guest_code_limits(void) +{ + char test_str[UCALL_BUFFER_LEN + 10]; + + memset(test_str, 'a', sizeof(test_str)); + test_str[sizeof(test_str) - 1] = 0; + + GUEST_PRINTF("%s", test_str); +} + +static void test_limits(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits); + run = vcpu->run; + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON, + "Unexpected exit reason: %u (%s),", + run->exit_reason, exit_reason_str(run->exit_reason)); + + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_ABORT, + "Unexpected ucall command: %lu, Expected: %u (UCALL_ABORT)", + uc.cmd, UCALL_ABORT); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + test_type_i64(vcpu, -1, -1); + test_type_i64(vcpu, -1, 1); + test_type_i64(vcpu, 0x1234567890abcdef, 0x1234567890abcdef); + test_type_i64(vcpu, 0x1234567890abcdef, 0x1234567890abcdee); + + test_type_u64(vcpu, 0x1234567890abcdef, 0x1234567890abcdef); + test_type_u64(vcpu, 0x1234567890abcdef, 0x1234567890abcdee); + test_type_x64(vcpu, 0x1234567890abcdef, 0x1234567890abcdef); + test_type_x64(vcpu, 0x1234567890abcdef, 0x1234567890abcdee); + test_type_X64(vcpu, 0x1234567890abcdef, 0x1234567890abcdef); + test_type_X64(vcpu, 0x1234567890abcdef, 0x1234567890abcdee); + + test_type_u32(vcpu, 0x90abcdef, 0x90abcdef); + test_type_u32(vcpu, 0x90abcdef, 0x90abcdee); + test_type_x32(vcpu, 0x90abcdef, 0x90abcdef); + test_type_x32(vcpu, 0x90abcdef, 0x90abcdee); + test_type_X32(vcpu, 0x90abcdef, 0x90abcdef); + test_type_X32(vcpu, 0x90abcdef, 0x90abcdee); + + test_type_int(vcpu, -1, -1); + test_type_int(vcpu, -1, 1); + test_type_int(vcpu, 1, 1); + + test_type_char(vcpu, 'a', 'a'); + test_type_char(vcpu, 'a', 'A'); + test_type_char(vcpu, 'a', 'b'); + + test_type_str(vcpu, "foo", "foo"); + test_type_str(vcpu, "foo", "bar"); + + test_type_ptr(vcpu, 0x1234567890abcdef, 0x1234567890abcdef); + test_type_ptr(vcpu, 0x1234567890abcdef, 0x1234567890abcdee); + + kvm_vm_free(vm); + + test_limits(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c new file mode 100644 index 000000000000..bce73bcb973c --- /dev/null +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This test is intended to reproduce a crash that happens when + * kvm_arch_hardware_disable is called and it attempts to unregister the user + * return notifiers. 
+ */ +#include <fcntl.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdint.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/wait.h> + +#include <test_util.h> + +#include "kvm_util.h" + +#define VCPU_NUM 4 +#define SLEEPING_THREAD_NUM (1 << 4) +#define FORK_NUM (1ULL << 9) +#define DELAY_US_MAX 2000 +#define GUEST_CODE_PIO_PORT 4 + +sem_t *sem; + +static void guest_code(void) +{ + for (;;) + ; /* Some busy work */ + printf("Should not be reached.\n"); +} + +static void *run_vcpu(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct kvm_run *run = vcpu->run; + + vcpu_run(vcpu); + + TEST_ASSERT(false, "%s: exited with reason %d: %s", + __func__, run->exit_reason, + exit_reason_str(run->exit_reason)); + pthread_exit(NULL); +} + +static void *sleeping_thread(void *arg) +{ + int fd; + + while (true) { + fd = open("/dev/null", O_RDWR); + close(fd); + } + TEST_ASSERT(false, "%s: exited", __func__); + pthread_exit(NULL); +} + +static inline void check_create_thread(pthread_t *thread, pthread_attr_t *attr, + void *(*f)(void *), void *arg) +{ + int r; + + r = pthread_create(thread, attr, f, arg); + TEST_ASSERT(r == 0, "%s: failed to create thread", __func__); +} + +static inline void check_set_affinity(pthread_t thread, cpu_set_t *cpu_set) +{ + int r; + + r = pthread_setaffinity_np(thread, sizeof(cpu_set_t), cpu_set); + TEST_ASSERT(r == 0, "%s: failed set affinity", __func__); +} + +static inline void check_join(pthread_t thread, void **retval) +{ + int r; + + r = pthread_join(thread, retval); + TEST_ASSERT(r == 0, "%s: failed to join thread", __func__); +} + +static void run_test(uint32_t run) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + cpu_set_t cpu_set; + pthread_t threads[VCPU_NUM]; + pthread_t throw_away; + void *b; + uint32_t i, j; + + CPU_ZERO(&cpu_set); + for (i = 0; i < VCPU_NUM; i++) + CPU_SET(i, &cpu_set); + + vm = vm_create(VCPU_NUM); + + pr_debug("%s: [%d] start vcpus\n", __func__, run); + for (i = 0; i < VCPU_NUM; ++i) { + vcpu = vm_vcpu_add(vm, i, guest_code); + + check_create_thread(&threads[i], NULL, run_vcpu, vcpu); + check_set_affinity(threads[i], &cpu_set); + + for (j = 0; j < SLEEPING_THREAD_NUM; ++j) { + check_create_thread(&throw_away, NULL, sleeping_thread, + (void *)NULL); + check_set_affinity(throw_away, &cpu_set); + } + } + pr_debug("%s: [%d] all threads launched\n", __func__, run); + sem_post(sem); + for (i = 0; i < VCPU_NUM; ++i) + check_join(threads[i], &b); + /* Should not be reached */ + TEST_ASSERT(false, "%s: [%d] child escaped the ninja", __func__, run); +} + +void wait_for_child_setup(pid_t pid) +{ + /* + * Wait for the child to post to the semaphore, but wake up periodically + * to check if the child exited prematurely. + */ + for (;;) { + const struct timespec wait_period = { .tv_sec = 1 }; + int status; + + if (!sem_timedwait(sem, &wait_period)) + return; + + /* Child is still running, keep waiting. */ + if (pid != waitpid(pid, &status, WNOHANG)) + continue; + + /* + * Child is no longer running, which is not expected. + * + * If it exited with a non-zero status, we explicitly forward + * the child's status in case it exited with KSFT_SKIP. 
+ */ + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else + TEST_ASSERT(false, "Child exited unexpectedly"); + } +} + +int main(int argc, char **argv) +{ + uint32_t i; + int s, r; + pid_t pid; + + sem = sem_open("vm_sem", O_CREAT | O_EXCL, 0644, 0); + sem_unlink("vm_sem"); + + for (i = 0; i < FORK_NUM; ++i) { + pid = fork(); + TEST_ASSERT(pid >= 0, "%s: unable to fork", __func__); + if (pid == 0) + run_test(i); /* This function always exits */ + + pr_debug("%s: [%d] waiting semaphore\n", __func__, i); + wait_for_child_setup(pid); + r = (rand() % DELAY_US_MAX) + 1; + pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); + usleep(r); + r = waitpid(pid, &s, WNOHANG); + TEST_ASSERT(r != pid, + "%s: [%d] child exited unexpectedly status: [%d]", + __func__, i, s); + pr_debug("%s: [%d] killing child\n", __func__, i); + kill(pid, SIGKILL); + } + + sem_destroy(sem); + exit(0); +} diff --git a/tools/testing/selftests/kvm/include/aarch64/arch_timer.h b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h new file mode 100644 index 000000000000..b3e97525cb55 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/arch_timer.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Timer specific interface + */ + +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +enum arch_timer { + VIRTUAL, + PHYSICAL, +}; + +#define CTL_ENABLE (1 << 0) +#define CTL_IMASK (1 << 1) +#define CTL_ISTATUS (1 << 2) + +#define msec_to_cycles(msec) \ + (timer_get_cntfrq() * (uint64_t)(msec) / 1000) + +#define usec_to_cycles(usec) \ + (timer_get_cntfrq() * (uint64_t)(usec) / 1000000) + +#define cycles_to_usec(cycles) \ + ((uint64_t)(cycles) * 1000000 / timer_get_cntfrq()) + +static inline uint32_t timer_get_cntfrq(void) +{ + return read_sysreg(cntfrq_el0); +} + +static inline uint64_t timer_get_cntct(enum arch_timer timer) +{ + isb(); + + switch (timer) { + case VIRTUAL: + return read_sysreg(cntvct_el0); + case PHYSICAL: + return read_sysreg(cntpct_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_cval(enum arch_timer timer, uint64_t cval) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(cval, cntv_cval_el0); + break; + case PHYSICAL: + write_sysreg(cval, cntp_cval_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline uint64_t timer_get_cval(enum arch_timer timer) +{ + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_cval_el0); + case PHYSICAL: + return read_sysreg(cntp_cval_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_tval(enum arch_timer timer, uint32_t tval) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(tval, cntv_tval_el0); + break; + case PHYSICAL: + write_sysreg(tval, cntp_tval_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline void timer_set_ctl(enum arch_timer timer, uint32_t ctl) +{ + switch (timer) { + case VIRTUAL: + write_sysreg(ctl, cntv_ctl_el0); + break; + case PHYSICAL: + write_sysreg(ctl, cntp_ctl_el0); + break; + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + isb(); +} + +static inline uint32_t timer_get_ctl(enum arch_timer timer) +{ + switch (timer) { + case VIRTUAL: + return read_sysreg(cntv_ctl_el0); + case PHYSICAL: + return 
read_sysreg(cntp_ctl_el0); + default: + GUEST_FAIL("Unexpected timer type = %u", timer); + } + + /* We should not reach here */ + return 0; +} + +static inline void timer_set_next_cval_ms(enum arch_timer timer, uint32_t msec) +{ + uint64_t now_ct = timer_get_cntct(timer); + uint64_t next_ct = now_ct + msec_to_cycles(msec); + + timer_set_cval(timer, next_ct); +} + +static inline void timer_set_next_tval_ms(enum arch_timer timer, uint32_t msec) +{ + timer_set_tval(timer, msec_to_cycles(msec)); +} + +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/delay.h b/tools/testing/selftests/kvm/include/aarch64/delay.h new file mode 100644 index 000000000000..329e4f5079ea --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/delay.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM simple delay routines + */ + +#ifndef SELFTEST_KVM_ARM_DELAY_H +#define SELFTEST_KVM_ARM_DELAY_H + +#include "arch_timer.h" + +static inline void __delay(uint64_t cycles) +{ + enum arch_timer timer = VIRTUAL; + uint64_t start = timer_get_cntct(timer); + + while ((timer_get_cntct(timer) - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} + +#endif /* SELFTEST_KVM_ARM_DELAY_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic.h b/tools/testing/selftests/kvm/include/aarch64/gic.h new file mode 100644 index 000000000000..baeb3c859389 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) specific defines + */ + +#ifndef SELFTEST_KVM_GIC_H +#define SELFTEST_KVM_GIC_H + +#include <asm/kvm.h> + +enum gic_type { + GIC_V3, + GIC_TYPE_MAX, +}; + +/* + * Note that the redistributor frames are at the end, as the range scales + * with the number of vCPUs in the VM. + */ +#define GITS_BASE_GPA 0x8000000ULL +#define GICD_BASE_GPA (GITS_BASE_GPA + KVM_VGIC_V3_ITS_SIZE) +#define GICR_BASE_GPA (GICD_BASE_GPA + KVM_VGIC_V3_DIST_SIZE) + +/* The GIC is identity-mapped into the guest at the time of setup. */ +#define GITS_BASE_GVA ((volatile void *)GITS_BASE_GPA) +#define GICD_BASE_GVA ((volatile void *)GICD_BASE_GPA) +#define GICR_BASE_GVA ((volatile void *)GICR_BASE_GPA) + +#define MIN_SGI 0 +#define MIN_PPI 16 +#define MIN_SPI 32 +#define MAX_SPI 1019 +#define IAR_SPURIOUS 1023 + +#define INTID_IS_SGI(intid) (0 <= (intid) && (intid) < MIN_PPI) +#define INTID_IS_PPI(intid) (MIN_PPI <= (intid) && (intid) < MIN_SPI) +#define INTID_IS_SPI(intid) (MIN_SPI <= (intid) && (intid) <= MAX_SPI) + +void gic_init(enum gic_type type, unsigned int nr_cpus); +void gic_irq_enable(unsigned int intid); +void gic_irq_disable(unsigned int intid); +unsigned int gic_get_and_ack_irq(void); +void gic_set_eoi(unsigned int intid); +void gic_set_dir(unsigned int intid); + +/* + * Sets the EOI mode. When split is false, EOI just drops the priority. When + * split is true, EOI drops the priority and deactivates the interrupt. 
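For orientation, a minimal guest-side sketch of how the arch_timer.h helpers above are meant to combine; the choice of the VIRTUAL timer, the 10 ms deadline and the mask-on-exit step are assumptions made for the example, not requirements of the header.

#include "arch_timer.h"	/* also pulls in processor.h for cpu_relax() */

/* Illustrative only: arm the virtual timer ~10 ms out and poll for expiry. */
static void guest_vtimer_sketch(void)
{
	/* Program an absolute compare value 10 ms from now. */
	timer_set_next_cval_ms(VIRTUAL, 10);

	/* Enable the timer with its interrupt unmasked. */
	timer_set_ctl(VIRTUAL, CTL_ENABLE);

	/* Poll the status bit instead of taking the interrupt. */
	while (!(timer_get_ctl(VIRTUAL) & CTL_ISTATUS))
		cpu_relax();

	/* Mask and disable before re-arming. */
	timer_set_ctl(VIRTUAL, CTL_IMASK);
}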
+ */ +void gic_set_eoi_split(bool split); +void gic_set_priority_mask(uint64_t mask); +void gic_set_priority(uint32_t intid, uint32_t prio); +void gic_irq_set_active(unsigned int intid); +void gic_irq_clear_active(unsigned int intid); +bool gic_irq_get_active(unsigned int intid); +void gic_irq_set_pending(unsigned int intid); +void gic_irq_clear_pending(unsigned int intid); +bool gic_irq_get_pending(unsigned int intid); +void gic_irq_set_config(unsigned int intid, bool is_edge); + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table); + +#endif /* SELFTEST_KVM_GIC_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h new file mode 100644 index 000000000000..a76615fa39a1 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3.h @@ -0,0 +1,604 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ +#ifndef __SELFTESTS_GIC_V3_H +#define __SELFTESTS_GIC_V3_H + +/* + * Distributor registers. We assume we're running non-secure, with ARE + * being set. Secure-only and non-ARE registers are not described. + */ +#define GICD_CTLR 0x0000 +#define GICD_TYPER 0x0004 +#define GICD_IIDR 0x0008 +#define GICD_TYPER2 0x000C +#define GICD_STATUSR 0x0010 +#define GICD_SETSPI_NSR 0x0040 +#define GICD_CLRSPI_NSR 0x0048 +#define GICD_SETSPI_SR 0x0050 +#define GICD_CLRSPI_SR 0x0058 +#define GICD_IGROUPR 0x0080 +#define GICD_ISENABLER 0x0100 +#define GICD_ICENABLER 0x0180 +#define GICD_ISPENDR 0x0200 +#define GICD_ICPENDR 0x0280 +#define GICD_ISACTIVER 0x0300 +#define GICD_ICACTIVER 0x0380 +#define GICD_IPRIORITYR 0x0400 +#define GICD_ICFGR 0x0C00 +#define GICD_IGRPMODR 0x0D00 +#define GICD_NSACR 0x0E00 +#define GICD_IGROUPRnE 0x1000 +#define GICD_ISENABLERnE 0x1200 +#define GICD_ICENABLERnE 0x1400 +#define GICD_ISPENDRnE 0x1600 +#define GICD_ICPENDRnE 0x1800 +#define GICD_ISACTIVERnE 0x1A00 +#define GICD_ICACTIVERnE 0x1C00 +#define GICD_IPRIORITYRnE 0x2000 +#define GICD_ICFGRnE 0x3000 +#define GICD_IROUTER 0x6000 +#define GICD_IROUTERnE 0x8000 +#define GICD_IDREGS 0xFFD0 +#define GICD_PIDR2 0xFFE8 + +#define ESPI_BASE_INTID 4096 + +/* + * Those registers are actually from GICv2, but the spec demands that they + * are implemented as RES0 if ARE is 1 (which we do in KVM's emulated GICv3). 
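As a rough sketch of how guest code drives the gic.h interface declared above; the SPI number (42) and the flat polling flow are illustrative assumptions, since real tests take the interrupt through a handler installed with vm_install_exception_handler().

#include "gic.h"
#include "processor.h"

/* Illustrative only: bring up the GIC, enable one SPI, then ack/EOI it. */
static void guest_gic_sketch(unsigned int nr_cpus)
{
	unsigned int intid;

	gic_init(GIC_V3, nr_cpus);
	gic_set_priority_mask(0xff);	/* accept every priority */
	gic_irq_enable(42);		/* hypothetical SPI */
	local_irq_enable();

	/* What an IRQ handler would do once the interrupt fires. */
	intid = gic_get_and_ack_irq();
	if (intid != IAR_SPURIOUS)
		gic_set_eoi(intid);
}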
+ */ +#define GICD_ITARGETSR 0x0800 +#define GICD_SGIR 0x0F00 +#define GICD_CPENDSGIR 0x0F10 +#define GICD_SPENDSGIR 0x0F20 + +#define GICD_CTLR_RWP (1U << 31) +#define GICD_CTLR_nASSGIreq (1U << 8) +#define GICD_CTLR_DS (1U << 6) +#define GICD_CTLR_ARE_NS (1U << 4) +#define GICD_CTLR_ENABLE_G1A (1U << 1) +#define GICD_CTLR_ENABLE_G1 (1U << 0) + +#define GICD_IIDR_IMPLEMENTER_SHIFT 0 +#define GICD_IIDR_IMPLEMENTER_MASK (0xfff << GICD_IIDR_IMPLEMENTER_SHIFT) +#define GICD_IIDR_REVISION_SHIFT 12 +#define GICD_IIDR_REVISION_MASK (0xf << GICD_IIDR_REVISION_SHIFT) +#define GICD_IIDR_VARIANT_SHIFT 16 +#define GICD_IIDR_VARIANT_MASK (0xf << GICD_IIDR_VARIANT_SHIFT) +#define GICD_IIDR_PRODUCT_ID_SHIFT 24 +#define GICD_IIDR_PRODUCT_ID_MASK (0xff << GICD_IIDR_PRODUCT_ID_SHIFT) + + +/* + * In systems with a single security state (what we emulate in KVM) + * the meaning of the interrupt group enable bits is slightly different + */ +#define GICD_CTLR_ENABLE_SS_G1 (1U << 1) +#define GICD_CTLR_ENABLE_SS_G0 (1U << 0) + +#define GICD_TYPER_RSS (1U << 26) +#define GICD_TYPER_LPIS (1U << 17) +#define GICD_TYPER_MBIS (1U << 16) +#define GICD_TYPER_ESPI (1U << 8) + +#define GICD_TYPER_ID_BITS(typer) ((((typer) >> 19) & 0x1f) + 1) +#define GICD_TYPER_NUM_LPIS(typer) ((((typer) >> 11) & 0x1f) + 1) +#define GICD_TYPER_SPIS(typer) ((((typer) & 0x1f) + 1) * 32) +#define GICD_TYPER_ESPIS(typer) \ + (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) + +#define GICD_TYPER2_nASSGIcap (1U << 8) +#define GICD_TYPER2_VIL (1U << 7) +#define GICD_TYPER2_VID GENMASK(4, 0) + +#define GICD_IROUTER_SPI_MODE_ONE (0U << 31) +#define GICD_IROUTER_SPI_MODE_ANY (1U << 31) + +#define GIC_PIDR2_ARCH_MASK 0xf0 +#define GIC_PIDR2_ARCH_GICv3 0x30 +#define GIC_PIDR2_ARCH_GICv4 0x40 + +#define GIC_V3_DIST_SIZE 0x10000 + +#define GIC_PAGE_SIZE_4K 0ULL +#define GIC_PAGE_SIZE_16K 1ULL +#define GIC_PAGE_SIZE_64K 2ULL +#define GIC_PAGE_SIZE_MASK 3ULL + +/* + * Re-Distributor registers, offsets from RD_base + */ +#define GICR_CTLR GICD_CTLR +#define GICR_IIDR 0x0004 +#define GICR_TYPER 0x0008 +#define GICR_STATUSR GICD_STATUSR +#define GICR_WAKER 0x0014 +#define GICR_SETLPIR 0x0040 +#define GICR_CLRLPIR 0x0048 +#define GICR_PROPBASER 0x0070 +#define GICR_PENDBASER 0x0078 +#define GICR_INVLPIR 0x00A0 +#define GICR_INVALLR 0x00B0 +#define GICR_SYNCR 0x00C0 +#define GICR_IDREGS GICD_IDREGS +#define GICR_PIDR2 GICD_PIDR2 + +#define GICR_CTLR_ENABLE_LPIS (1UL << 0) +#define GICR_CTLR_CES (1UL << 1) +#define GICR_CTLR_IR (1UL << 2) +#define GICR_CTLR_RWP (1UL << 3) + +#define GICR_TYPER_CPU_NUMBER(r) (((r) >> 8) & 0xffff) + +#define EPPI_BASE_INTID 1056 + +#define GICR_TYPER_NR_PPIS(r) \ + ({ \ + unsigned int __ppinum = ((r) >> 27) & 0x1f; \ + unsigned int __nr_ppis = 16; \ + if (__ppinum == 1 || __ppinum == 2) \ + __nr_ppis += __ppinum * 32; \ + \ + __nr_ppis; \ + }) + +#define GICR_WAKER_ProcessorSleep (1U << 1) +#define GICR_WAKER_ChildrenAsleep (1U << 2) + +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL + +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + 
(GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + +#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + +#define GICR_PENDBASER_PTZ BIT_ULL(62) + +/* + * Re-Distributor registers, offsets from SGI_base + */ +#define GICR_IGROUPR0 GICD_IGROUPR +#define GICR_ISENABLER0 GICD_ISENABLER +#define GICR_ICENABLER0 GICD_ICENABLER +#define GICR_ISPENDR0 GICD_ISPENDR +#define GICR_ICPENDR0 GICD_ICPENDR +#define GICR_ISACTIVER0 GICD_ISACTIVER +#define GICR_ICACTIVER0 GICD_ICACTIVER +#define GICR_IPRIORITYR0 GICD_IPRIORITYR +#define GICR_ICFGR0 GICD_ICFGR +#define 
GICR_IGRPMODR0 GICD_IGRPMODR +#define GICR_NSACR GICD_NSACR + +#define GICR_TYPER_PLPIS (1U << 0) +#define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DIRTY (1U << 2) +#define GICR_TYPER_DirectLPIS (1U << 3) +#define GICR_TYPER_LAST (1U << 4) +#define GICR_TYPER_RVPEID (1U << 7) +#define GICR_TYPER_COMMON_LPI_AFF GENMASK_ULL(25, 24) +#define GICR_TYPER_AFFINITY GENMASK_ULL(63, 32) + +#define GICR_INVLPIR_INTID GENMASK_ULL(31, 0) +#define GICR_INVLPIR_VPEID GENMASK_ULL(47, 32) +#define GICR_INVLPIR_V GENMASK_ULL(63, 63) + +#define GICR_INVALLR_VPEID GICR_INVLPIR_VPEID +#define GICR_INVALLR_V GICR_INVLPIR_V + +#define GIC_V3_REDIST_SIZE 0x20000 + +#define LPI_PROP_GROUP1 (1 << 1) +#define LPI_PROP_ENABLED (1 << 0) + +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWb) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +/* + * GICv4.1 VPROPBASER reinvention. A subtle mix between the old + * VPROPBASER and ITS_BASER. Just not quite any of the two. 
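A short sketch of how the LPI_PROP_* bits above pair with gic_rdist_enable_lpis() from gic.h; the table sizes, the use of memset() in guest context and the barrier choice are assumptions for illustration.

#include <string.h>

#include "gic.h"
#include "processor.h"

/* Illustrative only: mark every LPI enabled/group-1, then enable LPIs. */
static void guest_lpi_cfg_sketch(uint8_t *cfg_va, vm_paddr_t cfg_pa,
				 size_t cfg_size, vm_paddr_t pend_pa)
{
	memset(cfg_va, LPI_PROP_ENABLED | LPI_PROP_GROUP1, cfg_size);
	dsb(ishst);	/* publish the table before the redistributor reads it */

	gic_rdist_enable_lpis(cfg_pa, cfg_size, pend_pa);
}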
+ */ +#define GICR_VPROPBASER_4_1_VALID (1ULL << 63) +#define GICR_VPROPBASER_4_1_ENTRY_SIZE GENMASK_ULL(61, 59) +#define GICR_VPROPBASER_4_1_INDIRECT (1ULL << 55) +#define GICR_VPROPBASER_4_1_PAGE_SIZE GENMASK_ULL(54, 53) +#define GICR_VPROPBASER_4_1_Z (1ULL << 52) +#define GICR_VPROPBASER_4_1_ADDR GENMASK_ULL(51, 12) +#define GICR_VPROPBASER_4_1_SIZE GENMASK_ULL(6, 0) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, InnerShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWb) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + +/* + * GICv4.1 VPENDBASER, used for VPE residency. On top of these fields, + * also use the above Valid, PendingLast and Dirty. 
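To show how the GICR_{PROP,PEND}BASER field macros above compose into a register value, a hedged example; the address variables, the RaWaWb/InnerShareable attributes and the direct writeq_relaxed() programming are assumptions, and in these selftests gic_rdist_enable_lpis() normally takes care of this.

/* Illustrative only: build and program PROPBASER/PENDBASER by hand. */
static void rdist_baser_sketch(volatile void *rdist_base, vm_paddr_t cfg_pa,
			       vm_paddr_t pend_pa, unsigned int idbits)
{
	uint64_t propbaser = GICR_PROPBASER_ADDRESS(cfg_pa) |
			     GICR_PROPBASER_InnerShareable |
			     GICR_PROPBASER_RaWaWb |
			     ((idbits - 1) & GICR_PROPBASER_IDBITS_MASK);
	uint64_t pendbaser = GICR_PENDBASER_ADDRESS(pend_pa) |
			     GICR_PENDBASER_InnerShareable |
			     GICR_PENDBASER_RaWaWb |
			     GICR_PENDBASER_PTZ;

	writeq_relaxed(propbaser, rdist_base + GICR_PROPBASER);
	writeq_relaxed(pendbaser, rdist_base + GICR_PENDBASER);
}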
+ */ +#define GICR_VPENDBASER_4_1_DB (1ULL << 62) +#define GICR_VPENDBASER_4_1_VGRP0EN (1ULL << 59) +#define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) +#define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) + +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + +/* + * ITS registers, offsets from ITS_base + */ +#define GITS_CTLR 0x0000 +#define GITS_IIDR 0x0004 +#define GITS_TYPER 0x0008 +#define GITS_MPIDR 0x0018 +#define GITS_CBASER 0x0080 +#define GITS_CWRITER 0x0088 +#define GITS_CREADR 0x0090 +#define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 +#define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc + +#define GITS_TRANSLATER 0x10040 + +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + +#define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) +#define GITS_CTLR_QUIESCENT (1U << 31) + +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE GENMASK_ULL(7, 4) +#define GITS_TYPER_IDBITS_SHIFT 8 +#define GITS_TYPER_DEVBITS_SHIFT 13 +#define GITS_TYPER_DEVBITS GENMASK_ULL(17, 13) +#define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HCC_SHIFT 24 +#define GITS_TYPER_HCC(r) (((r) >> GITS_TYPER_HCC_SHIFT) & 0xff) +#define GITS_TYPER_VMOVP (1ULL << 37) +#define GITS_TYPER_VMAPP (1ULL << 40) +#define GITS_TYPER_SVPET GENMASK_ULL(42, 41) + +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + +#define GITS_CBASER_VALID (1ULL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK + +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) + +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWb) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) + +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + +#define GITS_BASER_NR_REGS 8 + +#define GITS_BASER_VALID (1ULL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) + +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define 
GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + +#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) +#define GITS_BASER_PHYS_52_to_48(phys) \ + (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + +#define GITS_BASER_SHAREABILITY_SHIFT (10) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) +#define GITS_BASER_PAGE_SIZE_SHIFT (8) +#define __GITS_BASER_PSZ(sz) (GIC_PAGE_SIZE_ ## sz << GITS_BASER_PAGE_SIZE_SHIFT) +#define GITS_BASER_PAGE_SIZE_4K __GITS_BASER_PSZ(4K) +#define GITS_BASER_PAGE_SIZE_16K __GITS_BASER_PSZ(16K) +#define GITS_BASER_PAGE_SIZE_64K __GITS_BASER_PSZ(64K) +#define GITS_BASER_PAGE_SIZE_MASK __GITS_BASER_PSZ(MASK) +#define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_PAGES_SHIFT (0) +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) + +#define GITS_BASER_TYPE_NONE 0 +#define GITS_BASER_TYPE_DEVICE 1 +#define GITS_BASER_TYPE_VCPU 2 +#define GITS_BASER_TYPE_RESERVED3 3 +#define GITS_BASER_TYPE_COLLECTION 4 +#define GITS_BASER_TYPE_RESERVED5 5 +#define GITS_BASER_TYPE_RESERVED6 6 +#define GITS_BASER_TYPE_RESERVED7 7 + +#define GITS_LVL1_ENTRY_SIZE (8UL) + +/* + * ITS commands + */ +#define GITS_CMD_MAPD 0x08 +#define GITS_CMD_MAPC 0x09 +#define GITS_CMD_MAPTI 0x0a +#define GITS_CMD_MAPI 0x0b +#define GITS_CMD_MOVI 0x01 +#define GITS_CMD_DISCARD 0x0f +#define GITS_CMD_INV 0x0c +#define GITS_CMD_MOVALL 0x0e +#define GITS_CMD_INVALL 0x0d +#define GITS_CMD_INT 0x03 +#define GITS_CMD_CLEAR 0x04 +#define GITS_CMD_SYNC 0x05 + +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) +#define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) + +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + +/* + * CPU interface registers + */ +#define ICC_CTLR_EL1_EOImode_SHIFT (1) +#define ICC_CTLR_EL1_EOImode_drop_dir (0U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_drop (1U << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_EOImode_MASK (1 << ICC_CTLR_EL1_EOImode_SHIFT) +#define ICC_CTLR_EL1_CBPR_SHIFT 0 +#define ICC_CTLR_EL1_CBPR_MASK (1 << ICC_CTLR_EL1_CBPR_SHIFT) +#define ICC_CTLR_EL1_PMHE_SHIFT 6 +#define ICC_CTLR_EL1_PMHE_MASK (1 << ICC_CTLR_EL1_PMHE_SHIFT) +#define ICC_CTLR_EL1_PRI_BITS_SHIFT 8 +#define ICC_CTLR_EL1_PRI_BITS_MASK (0x7 << ICC_CTLR_EL1_PRI_BITS_SHIFT) +#define ICC_CTLR_EL1_ID_BITS_SHIFT 11 +#define ICC_CTLR_EL1_ID_BITS_MASK (0x7 << ICC_CTLR_EL1_ID_BITS_SHIFT) +#define ICC_CTLR_EL1_SEIS_SHIFT 14 +#define ICC_CTLR_EL1_SEIS_MASK (0x1 << ICC_CTLR_EL1_SEIS_SHIFT) +#define ICC_CTLR_EL1_A3V_SHIFT 15 +#define ICC_CTLR_EL1_A3V_MASK (0x1 << ICC_CTLR_EL1_A3V_SHIFT) +#define ICC_CTLR_EL1_RSS (0x1 << 18) +#define ICC_CTLR_EL1_ExtRange (0x1 << 19) +#define ICC_PMR_EL1_SHIFT 0 +#define ICC_PMR_EL1_MASK (0xff << ICC_PMR_EL1_SHIFT) +#define ICC_BPR0_EL1_SHIFT 0 +#define ICC_BPR0_EL1_MASK (0x7 << ICC_BPR0_EL1_SHIFT) +#define ICC_BPR1_EL1_SHIFT 0 +#define ICC_BPR1_EL1_MASK (0x7 << ICC_BPR1_EL1_SHIFT) +#define ICC_IGRPEN0_EL1_SHIFT 0 +#define ICC_IGRPEN0_EL1_MASK (1 << ICC_IGRPEN0_EL1_SHIFT) +#define ICC_IGRPEN1_EL1_SHIFT 0 +#define ICC_IGRPEN1_EL1_MASK (1 << ICC_IGRPEN1_EL1_SHIFT) +#define ICC_SRE_EL1_DIB (1U << 2) +#define ICC_SRE_EL1_DFB (1U << 1) +#define ICC_SRE_EL1_SRE (1U << 0) + +/* These are for GICv2 emulation only */ +#define GICH_LR_VIRTUALID (0x3ffUL << 0) +#define GICH_LR_PHYSID_CPUID_SHIFT (10) +#define GICH_LR_PHYSID_CPUID (7UL << GICH_LR_PHYSID_CPUID_SHIFT) + +#define ICC_IAR1_EL1_SPURIOUS 0x3ff + +#define ICC_SRE_EL2_SRE (1 << 0) +#define ICC_SRE_EL2_ENABLE (1 << 3) + +#define ICC_SGI1R_TARGET_LIST_SHIFT 0 +#define ICC_SGI1R_TARGET_LIST_MASK (0xffff << ICC_SGI1R_TARGET_LIST_SHIFT) +#define ICC_SGI1R_AFFINITY_1_SHIFT 16 +#define ICC_SGI1R_AFFINITY_1_MASK (0xff << ICC_SGI1R_AFFINITY_1_SHIFT) +#define ICC_SGI1R_SGI_ID_SHIFT 24 +#define ICC_SGI1R_SGI_ID_MASK (0xfULL << ICC_SGI1R_SGI_ID_SHIFT) +#define ICC_SGI1R_AFFINITY_2_SHIFT 32 +#define ICC_SGI1R_AFFINITY_2_MASK (0xffULL << ICC_SGI1R_AFFINITY_2_SHIFT) +#define ICC_SGI1R_IRQ_ROUTING_MODE_BIT 40 +#define ICC_SGI1R_RS_SHIFT 44 +#define ICC_SGI1R_RS_MASK (0xfULL << ICC_SGI1R_RS_SHIFT) +#define ICC_SGI1R_AFFINITY_3_SHIFT 48 +#define ICC_SGI1R_AFFINITY_3_MASK (0xffULL << ICC_SGI1R_AFFINITY_3_SHIFT) + +#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h new file mode 100644 index 000000000000..3722ed9c8f96 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/gic_v3_its.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTESTS_GIC_V3_ITS_H__ +#define __SELFTESTS_GIC_V3_ITS_H__ + +#include <linux/sizes.h> + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + 
vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size); + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid); +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid); +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid); +void its_send_invall_cmd(void *cmdq_base, u32 collection_id); + +#endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h new file mode 100644 index 000000000000..e43a57d99b56 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/kvm_util_arch.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +struct kvm_vm_arch {}; + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index b7fa0c8551db..9b20a355d81a 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -8,16 +8,27 @@ #define SELFTEST_KVM_PROCESSOR_H #include "kvm_util.h" +#include "ucall_common.h" + +#include <linux/stringify.h> +#include <linux/types.h> +#include <asm/sysreg.h> #define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x)) -#define CPACR_EL1 3, 0, 1, 0, 2 -#define TCR_EL1 3, 0, 2, 0, 2 -#define MAIR_EL1 3, 0, 10, 2, 0 -#define TTBR0_EL1 3, 0, 2, 0, 0 -#define SCTLR_EL1 3, 0, 1, 0, 0 +/* + * KVM_ARM64_SYS_REG(sys_reg_id): Helper macro to convert + * SYS_* register definitions in asm/sysreg.h to use in KVM + * calls such as vcpu_get_reg() and vcpu_set_reg(). + */ +#define KVM_ARM64_SYS_REG(sys_reg_id) \ + ARM64_SYS_REG(sys_reg_Op0(sys_reg_id), \ + sys_reg_Op1(sys_reg_id), \ + sys_reg_CRn(sys_reg_id), \ + sys_reg_CRm(sys_reg_id), \ + sys_reg_Op2(sys_reg_id)) /* * Default MAIR @@ -29,31 +40,207 @@ * NORMAL 4 1111:1111 * NORMAL_WT 5 1011:1011 */ -#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \ - (0x04ul << (1 * 8)) | \ - (0x0cul << (2 * 8)) | \ - (0x44ul << (3 * 8)) | \ - (0xfful << (4 * 8)) | \ - (0xbbul << (5 * 8))) - -static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) + +/* Linux doesn't use these memory types, so let's define them. 
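A condensed sketch of the command sequence the gic_v3_its.h helpers above imply; the IDs, the shared table size and mapping everything to vCPU 0 are assumptions for the example (8192 is simply the first LPI INTID).

#include "gic_v3_its.h"

/* Illustrative only: wire DeviceID 0 / EventID 0 to one LPI on vCPU 0. */
static void guest_its_sketch(void *cmdq_va, vm_paddr_t cmdq_pa,
			     vm_paddr_t coll_tbl, vm_paddr_t dev_tbl,
			     vm_paddr_t itt, size_t tbl_sz)
{
	const u32 device_id = 0, event_id = 0, coll_id = 0, intid = 8192;

	its_init(coll_tbl, tbl_sz, dev_tbl, tbl_sz, cmdq_pa, tbl_sz);

	its_send_mapd_cmd(cmdq_va, device_id, itt, tbl_sz, true);		/* MAPD */
	its_send_mapc_cmd(cmdq_va, 0, coll_id, true);				/* MAPC: vCPU 0 */
	its_send_mapti_cmd(cmdq_va, device_id, event_id, coll_id, intid);	/* MAPTI */
	its_send_invall_cmd(cmdq_va, coll_id);					/* INVALL */
}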
*/ +#define MAIR_ATTR_DEVICE_GRE UL(0x0c) +#define MAIR_ATTR_NORMAL_WT UL(0xbb) + +#define MT_DEVICE_nGnRnE 0 +#define MT_DEVICE_nGnRE 1 +#define MT_DEVICE_GRE 2 +#define MT_NORMAL_NC 3 +#define MT_NORMAL 4 +#define MT_NORMAL_WT 5 + +#define DEFAULT_MAIR_EL1 \ + (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) + +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init); +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code); + +struct ex_regs { + u64 regs[31]; + u64 sp; + u64 pc; + u64 pstate; +}; + +#define VECTOR_NUM 16 + +enum { + VECTOR_SYNC_CURRENT_SP0, + VECTOR_IRQ_CURRENT_SP0, + VECTOR_FIQ_CURRENT_SP0, + VECTOR_ERROR_CURRENT_SP0, + + VECTOR_SYNC_CURRENT, + VECTOR_IRQ_CURRENT, + VECTOR_FIQ_CURRENT, + VECTOR_ERROR_CURRENT, + + VECTOR_SYNC_LOWER_64, + VECTOR_IRQ_LOWER_64, + VECTOR_FIQ_LOWER_64, + VECTOR_ERROR_LOWER_64, + + VECTOR_SYNC_LOWER_32, + VECTOR_IRQ_LOWER_32, + VECTOR_FIQ_LOWER_32, + VECTOR_ERROR_LOWER_32, +}; + +#define VECTOR_IS_SYNC(v) ((v) == VECTOR_SYNC_CURRENT_SP0 || \ + (v) == VECTOR_SYNC_CURRENT || \ + (v) == VECTOR_SYNC_LOWER_64 || \ + (v) == VECTOR_SYNC_LOWER_32) + +#define ESR_EC_NUM 64 +#define ESR_EC_SHIFT 26 +#define ESR_EC_MASK (ESR_EC_NUM - 1) + +#define ESR_EC_UNKNOWN 0x0 +#define ESR_EC_SVC64 0x15 +#define ESR_EC_IABT 0x21 +#define ESR_EC_DABT 0x25 +#define ESR_EC_HW_BP_CURRENT 0x31 +#define ESR_EC_SSTEP_CURRENT 0x33 +#define ESR_EC_WP_CURRENT 0x35 +#define ESR_EC_BRK_INS 0x3c + +/* Access flag */ +#define PTE_AF (1ULL << 10) + +/* Access flag update enable/disable */ +#define TCR_EL1_HA (1ULL << 39) + +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k); + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu); + +typedef void(*handler_fn)(struct ex_regs *); +void vm_install_exception_handler(struct kvm_vm *vm, + int vector, handler_fn handler); +void vm_install_sync_handler(struct kvm_vm *vm, + int vector, int ec, handler_fn handler); + +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline void cpu_relax(void) +{ + asm volatile("yield" ::: "memory"); +} + +#define isb() asm volatile("isb" : : : "memory") +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") + +#define dma_wmb() dmb(oshst) +#define __iowmb() dma_wmb() + +#define dma_rmb() dmb(oshld) + +#define __iormb(v) \ +({ \ + unsigned long tmp; \ + \ + dma_rmb(); \ + \ + /* \ + * Courtesy of arch/arm64/include/asm/io.h: \ + * Create a dummy control dependency from the IO read to any \ + * later instructions. This ensures that a subsequent call \ + * to udelay() will be ordered due to the ISB in __delay(). \ + */ \ + asm volatile("eor %0, %1, %1\n" \ + "cbnz %0, ." 
\ + : "=r" (tmp) : "r" ((unsigned long)(v)) \ + : "memory"); \ +}) + +static __always_inline void __raw_writel(u32 val, volatile void *addr) +{ + asm volatile("str %w0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u32 __raw_readl(const volatile void *addr) +{ + u32 val; + asm volatile("ldr %w0, [%1]" : "=r" (val) : "r" (addr)); + return val; +} + +static __always_inline void __raw_writeq(u64 val, volatile void *addr) +{ + asm volatile("str %0, [%1]" : : "rZ" (val), "r" (addr)); +} + +static __always_inline u64 __raw_readq(const volatile void *addr) { - struct kvm_one_reg reg; - reg.id = id; - reg.addr = (uint64_t)addr; - vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, ®); + u64 val; + asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); + return val; } -static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val) +#define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) +#define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; }) +#define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) +#define readq_relaxed(c) ({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; }) + +#define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c));}) +#define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(__v); __v; }) +#define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c));}) +#define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(__v); __v; }) + + +static inline void local_irq_enable(void) { - struct kvm_one_reg reg; - reg.id = id; - reg.addr = (uint64_t)&val; - vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, ®); + asm volatile("msr daifclr, #3" : : : "memory"); } -void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init); -void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_init *init, void *guest_code); +static inline void local_irq_disable(void) +{ + asm volatile("msr daifset, #3" : : : "memory"); +} + +/** + * struct arm_smccc_res - Result from SMC/HVC call + * @a0-a3 result values from registers 0 to 3 + */ +struct arm_smccc_res { + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; +}; + +/** + * smccc_hvc - Invoke a SMCCC function using the hvc conduit + * @function_id: the SMCCC function to be called + * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 + * @res: pointer to write the return values from registers x0-x3 + * + */ +void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res); + +/** + * smccc_smc - Invoke a SMCCC function using the smc conduit + * @function_id: the SMCCC function to be called + * @arg0-arg6: SMCCC function arguments, corresponding to registers x1-x7 + * @res: pointer to write the return values from registers x0-x3 + * + */ +void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res); #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/spinlock.h b/tools/testing/selftests/kvm/include/aarch64/spinlock.h new file mode 100644 index 000000000000..cf0984106d14 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/spinlock.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef SELFTEST_KVM_ARM64_SPINLOCK_H +#define SELFTEST_KVM_ARM64_SPINLOCK_H + +struct 
spinlock { + int v; +}; + +extern void spin_lock(struct spinlock *lock); +extern void spin_unlock(struct spinlock *lock); + +#endif /* SELFTEST_KVM_ARM64_SPINLOCK_H */ diff --git a/tools/testing/selftests/kvm/include/aarch64/ucall.h b/tools/testing/selftests/kvm/include/aarch64/ucall.h new file mode 100644 index 000000000000..4ec801f37f00 --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/ucall.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_MMIO + +/* + * ucall_exit_mmio_addr holds per-VM values (global data is duplicated by each + * VM), it must not be accessed from host code. + */ +extern vm_vaddr_t *ucall_exit_mmio_addr; + +static inline void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + WRITE_ONCE(*ucall_exit_mmio_addr, uc); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/aarch64/vgic.h b/tools/testing/selftests/kvm/include/aarch64/vgic.h new file mode 100644 index 000000000000..c481d0c00a5d --- /dev/null +++ b/tools/testing/selftests/kvm/include/aarch64/vgic.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) host specific defines + */ + +#ifndef SELFTEST_KVM_VGIC_H +#define SELFTEST_KVM_VGIC_H + +#include <linux/kvm.h> + +#include "kvm_util.h" + +#define REDIST_REGION_ATTR_ADDR(count, base, flags, index) \ + (((uint64_t)(count) << 52) | \ + ((uint64_t)((base) >> 16) << 16) | \ + ((uint64_t)(flags) << 12) | \ + index) + +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs); + +#define VGIC_MAX_RESERVED 1023 + +void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); +int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level); + +void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); +int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level); + +/* The vcpu arg only applies to private interrupts. */ +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu); + +#define KVM_IRQCHIP_NUM_PINS (1020 - 32) + +int vgic_its_setup(struct kvm_vm *vm); + +#endif // SELFTEST_KVM_VGIC_H diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h deleted file mode 100644 index a034438b6266..000000000000 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ /dev/null @@ -1,1102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * tools/testing/selftests/kvm/include/vmx.h - * - * Copyright (C) 2018, Red Hat, Inc. 
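On the host side, a brief sketch of how the vgic.h declarations above are typically used; the 64-IRQ budget and SPI 42 are illustrative assumptions.

#include <unistd.h>
#include <test_util.h>

#include "kvm_util.h"
#include "vgic.h"

/* Illustrative only: create a vGICv3 for the VM and pulse one SPI. */
static void host_vgic_sketch(struct kvm_vm *vm, unsigned int nr_vcpus)
{
	int gic_fd = vgic_v3_setup(vm, nr_vcpus, 64);

	TEST_ASSERT(gic_fd >= 0, "Failed to create a vGICv3");

	kvm_arm_irq_line(vm, 42, 1);	/* assert the hypothetical SPI */
	kvm_arm_irq_line(vm, 42, 0);	/* and deassert it again */

	close(gic_fd);
}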
- * - */ - -#ifndef SELFTEST_KVM_EVMCS_H -#define SELFTEST_KVM_EVMCS_H - -#include <stdint.h> -#include "vmx.h" - -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t - -#define EVMCS_VERSION 1 - -extern bool enable_evmcs; - -struct hv_vp_assist_page { - __u32 apic_assist; - __u32 reserved; - __u64 vtl_control[2]; - __u64 nested_enlightenments_control[2]; - __u32 enlighten_vmentry; - __u64 current_nested_vmcs; -}; - -struct hv_enlightened_vmcs { - u32 revision_id; - u32 abort; - - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - - u64 host_ia32_pat; - u64 host_ia32_efer; - - u64 host_cr0; - u64 host_cr3; - u64 host_cr4; - - u64 host_ia32_sysenter_esp; - u64 host_ia32_sysenter_eip; - u64 host_rip; - u32 host_ia32_sysenter_cs; - - u32 pin_based_vm_exec_control; - u32 vm_exit_controls; - u32 secondary_vm_exec_control; - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - - u64 guest_es_base; - u64 guest_cs_base; - u64 guest_ss_base; - u64 guest_ds_base; - u64 guest_fs_base; - u64 guest_gs_base; - u64 guest_ldtr_base; - u64 guest_tr_base; - u64 guest_gdtr_base; - u64 guest_idtr_base; - - u64 padding64_1[3]; - - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - - u64 cr3_target_value0; - u64 cr3_target_value1; - u64 cr3_target_value2; - u64 cr3_target_value3; - - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - - u32 cr3_target_count; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_msr_load_count; - - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 vmcs_link_pointer; - - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - - u64 guest_pending_dbg_exceptions; - u64 guest_sysenter_esp; - u64 guest_sysenter_eip; - - u32 guest_activity_state; - u32 guest_sysenter_cs; - - u64 cr0_guest_host_mask; - u64 cr4_guest_host_mask; - u64 cr0_read_shadow; - u64 cr4_read_shadow; - u64 guest_cr0; - u64 guest_cr3; - u64 guest_cr4; - u64 guest_dr7; - - u64 host_fs_base; - u64 host_gs_base; - u64 host_tr_base; - u64 host_gdtr_base; - u64 host_idtr_base; - u64 host_rsp; - - u64 ept_pointer; - - u16 virtual_processor_id; - u16 padding16[3]; - - u64 padding64_2[5]; - u64 guest_physical_address; - - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - - u64 exit_qualification; - u64 exit_io_instruction_ecx; - u64 exit_io_instruction_esi; - u64 exit_io_instruction_edi; - u64 exit_io_instruction_eip; - - u64 guest_linear_address; - u64 guest_rsp; - u64 guest_rflags; - - u32 
guest_interruptibility_info; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 vm_entry_controls; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - - u64 guest_rip; - - u32 hv_clean_fields; - u32 hv_padding_32; - u32 hv_synthetic_controls; - struct { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 reserved:30; - } hv_enlightenments_control; - u32 hv_vp_id; - - u64 hv_vm_id; - u64 partition_assist_page; - u64 padding64_4[4]; - u64 guest_bndcfgs; - u64 padding64_5[7]; - u64 xss_exit_bitmap; - u64 padding64_6[7]; -}; - -#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 -#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) - -extern struct hv_enlightened_vmcs *current_evmcs; -extern struct hv_vp_assist_page *current_vp_assist; - -int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); - -static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) -{ - u64 val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | - HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; - - wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); - - current_vp_assist = vp_assist; - - enable_evmcs = true; - - return 0; -} - -static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) -{ - current_vp_assist->current_nested_vmcs = vmcs_pa; - current_vp_assist->enlighten_vmentry = 1; - - current_evmcs = vmcs; - - return 0; -} - -static inline int evmcs_vmptrst(uint64_t *value) -{ - *value = current_vp_assist->current_nested_vmcs & - ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; - - return 0; -} - -static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) -{ - switch (encoding) { - case GUEST_RIP: - *value = current_evmcs->guest_rip; - break; - case GUEST_RSP: - *value = current_evmcs->guest_rsp; - break; - case GUEST_RFLAGS: - *value = current_evmcs->guest_rflags; - break; - case HOST_IA32_PAT: - *value = current_evmcs->host_ia32_pat; - break; - case HOST_IA32_EFER: - *value = current_evmcs->host_ia32_efer; - break; - case HOST_CR0: - *value = current_evmcs->host_cr0; - break; - case HOST_CR3: - *value = current_evmcs->host_cr3; - break; - case HOST_CR4: - *value = current_evmcs->host_cr4; - break; - case HOST_IA32_SYSENTER_ESP: - *value = current_evmcs->host_ia32_sysenter_esp; - break; - case HOST_IA32_SYSENTER_EIP: - *value = current_evmcs->host_ia32_sysenter_eip; - break; - case HOST_RIP: - *value = current_evmcs->host_rip; - break; - case IO_BITMAP_A: - *value = current_evmcs->io_bitmap_a; - break; - case IO_BITMAP_B: - *value = current_evmcs->io_bitmap_b; - break; - case MSR_BITMAP: - *value = current_evmcs->msr_bitmap; - break; - case GUEST_ES_BASE: - *value = current_evmcs->guest_es_base; - break; - case GUEST_CS_BASE: - *value = current_evmcs->guest_cs_base; - break; - case GUEST_SS_BASE: - *value = current_evmcs->guest_ss_base; - break; - case GUEST_DS_BASE: - *value = current_evmcs->guest_ds_base; - break; - case GUEST_FS_BASE: - *value = current_evmcs->guest_fs_base; - break; - case GUEST_GS_BASE: - *value = current_evmcs->guest_gs_base; - break; - case GUEST_LDTR_BASE: - *value = current_evmcs->guest_ldtr_base; - break; - case GUEST_TR_BASE: - *value = current_evmcs->guest_tr_base; - break; - case GUEST_GDTR_BASE: - *value = current_evmcs->guest_gdtr_base; - break; - case GUEST_IDTR_BASE: - *value = current_evmcs->guest_idtr_base; - break; - case TSC_OFFSET: - *value = 
current_evmcs->tsc_offset; - break; - case VIRTUAL_APIC_PAGE_ADDR: - *value = current_evmcs->virtual_apic_page_addr; - break; - case VMCS_LINK_POINTER: - *value = current_evmcs->vmcs_link_pointer; - break; - case GUEST_IA32_DEBUGCTL: - *value = current_evmcs->guest_ia32_debugctl; - break; - case GUEST_IA32_PAT: - *value = current_evmcs->guest_ia32_pat; - break; - case GUEST_IA32_EFER: - *value = current_evmcs->guest_ia32_efer; - break; - case GUEST_PDPTR0: - *value = current_evmcs->guest_pdptr0; - break; - case GUEST_PDPTR1: - *value = current_evmcs->guest_pdptr1; - break; - case GUEST_PDPTR2: - *value = current_evmcs->guest_pdptr2; - break; - case GUEST_PDPTR3: - *value = current_evmcs->guest_pdptr3; - break; - case GUEST_PENDING_DBG_EXCEPTIONS: - *value = current_evmcs->guest_pending_dbg_exceptions; - break; - case GUEST_SYSENTER_ESP: - *value = current_evmcs->guest_sysenter_esp; - break; - case GUEST_SYSENTER_EIP: - *value = current_evmcs->guest_sysenter_eip; - break; - case CR0_GUEST_HOST_MASK: - *value = current_evmcs->cr0_guest_host_mask; - break; - case CR4_GUEST_HOST_MASK: - *value = current_evmcs->cr4_guest_host_mask; - break; - case CR0_READ_SHADOW: - *value = current_evmcs->cr0_read_shadow; - break; - case CR4_READ_SHADOW: - *value = current_evmcs->cr4_read_shadow; - break; - case GUEST_CR0: - *value = current_evmcs->guest_cr0; - break; - case GUEST_CR3: - *value = current_evmcs->guest_cr3; - break; - case GUEST_CR4: - *value = current_evmcs->guest_cr4; - break; - case GUEST_DR7: - *value = current_evmcs->guest_dr7; - break; - case HOST_FS_BASE: - *value = current_evmcs->host_fs_base; - break; - case HOST_GS_BASE: - *value = current_evmcs->host_gs_base; - break; - case HOST_TR_BASE: - *value = current_evmcs->host_tr_base; - break; - case HOST_GDTR_BASE: - *value = current_evmcs->host_gdtr_base; - break; - case HOST_IDTR_BASE: - *value = current_evmcs->host_idtr_base; - break; - case HOST_RSP: - *value = current_evmcs->host_rsp; - break; - case EPT_POINTER: - *value = current_evmcs->ept_pointer; - break; - case GUEST_BNDCFGS: - *value = current_evmcs->guest_bndcfgs; - break; - case XSS_EXIT_BITMAP: - *value = current_evmcs->xss_exit_bitmap; - break; - case GUEST_PHYSICAL_ADDRESS: - *value = current_evmcs->guest_physical_address; - break; - case EXIT_QUALIFICATION: - *value = current_evmcs->exit_qualification; - break; - case GUEST_LINEAR_ADDRESS: - *value = current_evmcs->guest_linear_address; - break; - case VM_EXIT_MSR_STORE_ADDR: - *value = current_evmcs->vm_exit_msr_store_addr; - break; - case VM_EXIT_MSR_LOAD_ADDR: - *value = current_evmcs->vm_exit_msr_load_addr; - break; - case VM_ENTRY_MSR_LOAD_ADDR: - *value = current_evmcs->vm_entry_msr_load_addr; - break; - case CR3_TARGET_VALUE0: - *value = current_evmcs->cr3_target_value0; - break; - case CR3_TARGET_VALUE1: - *value = current_evmcs->cr3_target_value1; - break; - case CR3_TARGET_VALUE2: - *value = current_evmcs->cr3_target_value2; - break; - case CR3_TARGET_VALUE3: - *value = current_evmcs->cr3_target_value3; - break; - case TPR_THRESHOLD: - *value = current_evmcs->tpr_threshold; - break; - case GUEST_INTERRUPTIBILITY_INFO: - *value = current_evmcs->guest_interruptibility_info; - break; - case CPU_BASED_VM_EXEC_CONTROL: - *value = current_evmcs->cpu_based_vm_exec_control; - break; - case EXCEPTION_BITMAP: - *value = current_evmcs->exception_bitmap; - break; - case VM_ENTRY_CONTROLS: - *value = current_evmcs->vm_entry_controls; - break; - case VM_ENTRY_INTR_INFO_FIELD: - *value = current_evmcs->vm_entry_intr_info_field; 
- break; - case VM_ENTRY_EXCEPTION_ERROR_CODE: - *value = current_evmcs->vm_entry_exception_error_code; - break; - case VM_ENTRY_INSTRUCTION_LEN: - *value = current_evmcs->vm_entry_instruction_len; - break; - case HOST_IA32_SYSENTER_CS: - *value = current_evmcs->host_ia32_sysenter_cs; - break; - case PIN_BASED_VM_EXEC_CONTROL: - *value = current_evmcs->pin_based_vm_exec_control; - break; - case VM_EXIT_CONTROLS: - *value = current_evmcs->vm_exit_controls; - break; - case SECONDARY_VM_EXEC_CONTROL: - *value = current_evmcs->secondary_vm_exec_control; - break; - case GUEST_ES_LIMIT: - *value = current_evmcs->guest_es_limit; - break; - case GUEST_CS_LIMIT: - *value = current_evmcs->guest_cs_limit; - break; - case GUEST_SS_LIMIT: - *value = current_evmcs->guest_ss_limit; - break; - case GUEST_DS_LIMIT: - *value = current_evmcs->guest_ds_limit; - break; - case GUEST_FS_LIMIT: - *value = current_evmcs->guest_fs_limit; - break; - case GUEST_GS_LIMIT: - *value = current_evmcs->guest_gs_limit; - break; - case GUEST_LDTR_LIMIT: - *value = current_evmcs->guest_ldtr_limit; - break; - case GUEST_TR_LIMIT: - *value = current_evmcs->guest_tr_limit; - break; - case GUEST_GDTR_LIMIT: - *value = current_evmcs->guest_gdtr_limit; - break; - case GUEST_IDTR_LIMIT: - *value = current_evmcs->guest_idtr_limit; - break; - case GUEST_ES_AR_BYTES: - *value = current_evmcs->guest_es_ar_bytes; - break; - case GUEST_CS_AR_BYTES: - *value = current_evmcs->guest_cs_ar_bytes; - break; - case GUEST_SS_AR_BYTES: - *value = current_evmcs->guest_ss_ar_bytes; - break; - case GUEST_DS_AR_BYTES: - *value = current_evmcs->guest_ds_ar_bytes; - break; - case GUEST_FS_AR_BYTES: - *value = current_evmcs->guest_fs_ar_bytes; - break; - case GUEST_GS_AR_BYTES: - *value = current_evmcs->guest_gs_ar_bytes; - break; - case GUEST_LDTR_AR_BYTES: - *value = current_evmcs->guest_ldtr_ar_bytes; - break; - case GUEST_TR_AR_BYTES: - *value = current_evmcs->guest_tr_ar_bytes; - break; - case GUEST_ACTIVITY_STATE: - *value = current_evmcs->guest_activity_state; - break; - case GUEST_SYSENTER_CS: - *value = current_evmcs->guest_sysenter_cs; - break; - case VM_INSTRUCTION_ERROR: - *value = current_evmcs->vm_instruction_error; - break; - case VM_EXIT_REASON: - *value = current_evmcs->vm_exit_reason; - break; - case VM_EXIT_INTR_INFO: - *value = current_evmcs->vm_exit_intr_info; - break; - case VM_EXIT_INTR_ERROR_CODE: - *value = current_evmcs->vm_exit_intr_error_code; - break; - case IDT_VECTORING_INFO_FIELD: - *value = current_evmcs->idt_vectoring_info_field; - break; - case IDT_VECTORING_ERROR_CODE: - *value = current_evmcs->idt_vectoring_error_code; - break; - case VM_EXIT_INSTRUCTION_LEN: - *value = current_evmcs->vm_exit_instruction_len; - break; - case VMX_INSTRUCTION_INFO: - *value = current_evmcs->vmx_instruction_info; - break; - case PAGE_FAULT_ERROR_CODE_MASK: - *value = current_evmcs->page_fault_error_code_mask; - break; - case PAGE_FAULT_ERROR_CODE_MATCH: - *value = current_evmcs->page_fault_error_code_match; - break; - case CR3_TARGET_COUNT: - *value = current_evmcs->cr3_target_count; - break; - case VM_EXIT_MSR_STORE_COUNT: - *value = current_evmcs->vm_exit_msr_store_count; - break; - case VM_EXIT_MSR_LOAD_COUNT: - *value = current_evmcs->vm_exit_msr_load_count; - break; - case VM_ENTRY_MSR_LOAD_COUNT: - *value = current_evmcs->vm_entry_msr_load_count; - break; - case HOST_ES_SELECTOR: - *value = current_evmcs->host_es_selector; - break; - case HOST_CS_SELECTOR: - *value = current_evmcs->host_cs_selector; - break; - case 
HOST_SS_SELECTOR: - *value = current_evmcs->host_ss_selector; - break; - case HOST_DS_SELECTOR: - *value = current_evmcs->host_ds_selector; - break; - case HOST_FS_SELECTOR: - *value = current_evmcs->host_fs_selector; - break; - case HOST_GS_SELECTOR: - *value = current_evmcs->host_gs_selector; - break; - case HOST_TR_SELECTOR: - *value = current_evmcs->host_tr_selector; - break; - case GUEST_ES_SELECTOR: - *value = current_evmcs->guest_es_selector; - break; - case GUEST_CS_SELECTOR: - *value = current_evmcs->guest_cs_selector; - break; - case GUEST_SS_SELECTOR: - *value = current_evmcs->guest_ss_selector; - break; - case GUEST_DS_SELECTOR: - *value = current_evmcs->guest_ds_selector; - break; - case GUEST_FS_SELECTOR: - *value = current_evmcs->guest_fs_selector; - break; - case GUEST_GS_SELECTOR: - *value = current_evmcs->guest_gs_selector; - break; - case GUEST_LDTR_SELECTOR: - *value = current_evmcs->guest_ldtr_selector; - break; - case GUEST_TR_SELECTOR: - *value = current_evmcs->guest_tr_selector; - break; - case VIRTUAL_PROCESSOR_ID: - *value = current_evmcs->virtual_processor_id; - break; - default: return 1; - } - - return 0; -} - -static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) -{ - switch (encoding) { - case GUEST_RIP: - current_evmcs->guest_rip = value; - break; - case GUEST_RSP: - current_evmcs->guest_rsp = value; - break; - case GUEST_RFLAGS: - current_evmcs->guest_rflags = value; - break; - case HOST_IA32_PAT: - current_evmcs->host_ia32_pat = value; - break; - case HOST_IA32_EFER: - current_evmcs->host_ia32_efer = value; - break; - case HOST_CR0: - current_evmcs->host_cr0 = value; - break; - case HOST_CR3: - current_evmcs->host_cr3 = value; - break; - case HOST_CR4: - current_evmcs->host_cr4 = value; - break; - case HOST_IA32_SYSENTER_ESP: - current_evmcs->host_ia32_sysenter_esp = value; - break; - case HOST_IA32_SYSENTER_EIP: - current_evmcs->host_ia32_sysenter_eip = value; - break; - case HOST_RIP: - current_evmcs->host_rip = value; - break; - case IO_BITMAP_A: - current_evmcs->io_bitmap_a = value; - break; - case IO_BITMAP_B: - current_evmcs->io_bitmap_b = value; - break; - case MSR_BITMAP: - current_evmcs->msr_bitmap = value; - break; - case GUEST_ES_BASE: - current_evmcs->guest_es_base = value; - break; - case GUEST_CS_BASE: - current_evmcs->guest_cs_base = value; - break; - case GUEST_SS_BASE: - current_evmcs->guest_ss_base = value; - break; - case GUEST_DS_BASE: - current_evmcs->guest_ds_base = value; - break; - case GUEST_FS_BASE: - current_evmcs->guest_fs_base = value; - break; - case GUEST_GS_BASE: - current_evmcs->guest_gs_base = value; - break; - case GUEST_LDTR_BASE: - current_evmcs->guest_ldtr_base = value; - break; - case GUEST_TR_BASE: - current_evmcs->guest_tr_base = value; - break; - case GUEST_GDTR_BASE: - current_evmcs->guest_gdtr_base = value; - break; - case GUEST_IDTR_BASE: - current_evmcs->guest_idtr_base = value; - break; - case TSC_OFFSET: - current_evmcs->tsc_offset = value; - break; - case VIRTUAL_APIC_PAGE_ADDR: - current_evmcs->virtual_apic_page_addr = value; - break; - case VMCS_LINK_POINTER: - current_evmcs->vmcs_link_pointer = value; - break; - case GUEST_IA32_DEBUGCTL: - current_evmcs->guest_ia32_debugctl = value; - break; - case GUEST_IA32_PAT: - current_evmcs->guest_ia32_pat = value; - break; - case GUEST_IA32_EFER: - current_evmcs->guest_ia32_efer = value; - break; - case GUEST_PDPTR0: - current_evmcs->guest_pdptr0 = value; - break; - case GUEST_PDPTR1: - current_evmcs->guest_pdptr1 = value; - break; - case GUEST_PDPTR2: 
- current_evmcs->guest_pdptr2 = value; - break; - case GUEST_PDPTR3: - current_evmcs->guest_pdptr3 = value; - break; - case GUEST_PENDING_DBG_EXCEPTIONS: - current_evmcs->guest_pending_dbg_exceptions = value; - break; - case GUEST_SYSENTER_ESP: - current_evmcs->guest_sysenter_esp = value; - break; - case GUEST_SYSENTER_EIP: - current_evmcs->guest_sysenter_eip = value; - break; - case CR0_GUEST_HOST_MASK: - current_evmcs->cr0_guest_host_mask = value; - break; - case CR4_GUEST_HOST_MASK: - current_evmcs->cr4_guest_host_mask = value; - break; - case CR0_READ_SHADOW: - current_evmcs->cr0_read_shadow = value; - break; - case CR4_READ_SHADOW: - current_evmcs->cr4_read_shadow = value; - break; - case GUEST_CR0: - current_evmcs->guest_cr0 = value; - break; - case GUEST_CR3: - current_evmcs->guest_cr3 = value; - break; - case GUEST_CR4: - current_evmcs->guest_cr4 = value; - break; - case GUEST_DR7: - current_evmcs->guest_dr7 = value; - break; - case HOST_FS_BASE: - current_evmcs->host_fs_base = value; - break; - case HOST_GS_BASE: - current_evmcs->host_gs_base = value; - break; - case HOST_TR_BASE: - current_evmcs->host_tr_base = value; - break; - case HOST_GDTR_BASE: - current_evmcs->host_gdtr_base = value; - break; - case HOST_IDTR_BASE: - current_evmcs->host_idtr_base = value; - break; - case HOST_RSP: - current_evmcs->host_rsp = value; - break; - case EPT_POINTER: - current_evmcs->ept_pointer = value; - break; - case GUEST_BNDCFGS: - current_evmcs->guest_bndcfgs = value; - break; - case XSS_EXIT_BITMAP: - current_evmcs->xss_exit_bitmap = value; - break; - case GUEST_PHYSICAL_ADDRESS: - current_evmcs->guest_physical_address = value; - break; - case EXIT_QUALIFICATION: - current_evmcs->exit_qualification = value; - break; - case GUEST_LINEAR_ADDRESS: - current_evmcs->guest_linear_address = value; - break; - case VM_EXIT_MSR_STORE_ADDR: - current_evmcs->vm_exit_msr_store_addr = value; - break; - case VM_EXIT_MSR_LOAD_ADDR: - current_evmcs->vm_exit_msr_load_addr = value; - break; - case VM_ENTRY_MSR_LOAD_ADDR: - current_evmcs->vm_entry_msr_load_addr = value; - break; - case CR3_TARGET_VALUE0: - current_evmcs->cr3_target_value0 = value; - break; - case CR3_TARGET_VALUE1: - current_evmcs->cr3_target_value1 = value; - break; - case CR3_TARGET_VALUE2: - current_evmcs->cr3_target_value2 = value; - break; - case CR3_TARGET_VALUE3: - current_evmcs->cr3_target_value3 = value; - break; - case TPR_THRESHOLD: - current_evmcs->tpr_threshold = value; - break; - case GUEST_INTERRUPTIBILITY_INFO: - current_evmcs->guest_interruptibility_info = value; - break; - case CPU_BASED_VM_EXEC_CONTROL: - current_evmcs->cpu_based_vm_exec_control = value; - break; - case EXCEPTION_BITMAP: - current_evmcs->exception_bitmap = value; - break; - case VM_ENTRY_CONTROLS: - current_evmcs->vm_entry_controls = value; - break; - case VM_ENTRY_INTR_INFO_FIELD: - current_evmcs->vm_entry_intr_info_field = value; - break; - case VM_ENTRY_EXCEPTION_ERROR_CODE: - current_evmcs->vm_entry_exception_error_code = value; - break; - case VM_ENTRY_INSTRUCTION_LEN: - current_evmcs->vm_entry_instruction_len = value; - break; - case HOST_IA32_SYSENTER_CS: - current_evmcs->host_ia32_sysenter_cs = value; - break; - case PIN_BASED_VM_EXEC_CONTROL: - current_evmcs->pin_based_vm_exec_control = value; - break; - case VM_EXIT_CONTROLS: - current_evmcs->vm_exit_controls = value; - break; - case SECONDARY_VM_EXEC_CONTROL: - current_evmcs->secondary_vm_exec_control = value; - break; - case GUEST_ES_LIMIT: - current_evmcs->guest_es_limit = value; - break; - case 
GUEST_CS_LIMIT: - current_evmcs->guest_cs_limit = value; - break; - case GUEST_SS_LIMIT: - current_evmcs->guest_ss_limit = value; - break; - case GUEST_DS_LIMIT: - current_evmcs->guest_ds_limit = value; - break; - case GUEST_FS_LIMIT: - current_evmcs->guest_fs_limit = value; - break; - case GUEST_GS_LIMIT: - current_evmcs->guest_gs_limit = value; - break; - case GUEST_LDTR_LIMIT: - current_evmcs->guest_ldtr_limit = value; - break; - case GUEST_TR_LIMIT: - current_evmcs->guest_tr_limit = value; - break; - case GUEST_GDTR_LIMIT: - current_evmcs->guest_gdtr_limit = value; - break; - case GUEST_IDTR_LIMIT: - current_evmcs->guest_idtr_limit = value; - break; - case GUEST_ES_AR_BYTES: - current_evmcs->guest_es_ar_bytes = value; - break; - case GUEST_CS_AR_BYTES: - current_evmcs->guest_cs_ar_bytes = value; - break; - case GUEST_SS_AR_BYTES: - current_evmcs->guest_ss_ar_bytes = value; - break; - case GUEST_DS_AR_BYTES: - current_evmcs->guest_ds_ar_bytes = value; - break; - case GUEST_FS_AR_BYTES: - current_evmcs->guest_fs_ar_bytes = value; - break; - case GUEST_GS_AR_BYTES: - current_evmcs->guest_gs_ar_bytes = value; - break; - case GUEST_LDTR_AR_BYTES: - current_evmcs->guest_ldtr_ar_bytes = value; - break; - case GUEST_TR_AR_BYTES: - current_evmcs->guest_tr_ar_bytes = value; - break; - case GUEST_ACTIVITY_STATE: - current_evmcs->guest_activity_state = value; - break; - case GUEST_SYSENTER_CS: - current_evmcs->guest_sysenter_cs = value; - break; - case VM_INSTRUCTION_ERROR: - current_evmcs->vm_instruction_error = value; - break; - case VM_EXIT_REASON: - current_evmcs->vm_exit_reason = value; - break; - case VM_EXIT_INTR_INFO: - current_evmcs->vm_exit_intr_info = value; - break; - case VM_EXIT_INTR_ERROR_CODE: - current_evmcs->vm_exit_intr_error_code = value; - break; - case IDT_VECTORING_INFO_FIELD: - current_evmcs->idt_vectoring_info_field = value; - break; - case IDT_VECTORING_ERROR_CODE: - current_evmcs->idt_vectoring_error_code = value; - break; - case VM_EXIT_INSTRUCTION_LEN: - current_evmcs->vm_exit_instruction_len = value; - break; - case VMX_INSTRUCTION_INFO: - current_evmcs->vmx_instruction_info = value; - break; - case PAGE_FAULT_ERROR_CODE_MASK: - current_evmcs->page_fault_error_code_mask = value; - break; - case PAGE_FAULT_ERROR_CODE_MATCH: - current_evmcs->page_fault_error_code_match = value; - break; - case CR3_TARGET_COUNT: - current_evmcs->cr3_target_count = value; - break; - case VM_EXIT_MSR_STORE_COUNT: - current_evmcs->vm_exit_msr_store_count = value; - break; - case VM_EXIT_MSR_LOAD_COUNT: - current_evmcs->vm_exit_msr_load_count = value; - break; - case VM_ENTRY_MSR_LOAD_COUNT: - current_evmcs->vm_entry_msr_load_count = value; - break; - case HOST_ES_SELECTOR: - current_evmcs->host_es_selector = value; - break; - case HOST_CS_SELECTOR: - current_evmcs->host_cs_selector = value; - break; - case HOST_SS_SELECTOR: - current_evmcs->host_ss_selector = value; - break; - case HOST_DS_SELECTOR: - current_evmcs->host_ds_selector = value; - break; - case HOST_FS_SELECTOR: - current_evmcs->host_fs_selector = value; - break; - case HOST_GS_SELECTOR: - current_evmcs->host_gs_selector = value; - break; - case HOST_TR_SELECTOR: - current_evmcs->host_tr_selector = value; - break; - case GUEST_ES_SELECTOR: - current_evmcs->guest_es_selector = value; - break; - case GUEST_CS_SELECTOR: - current_evmcs->guest_cs_selector = value; - break; - case GUEST_SS_SELECTOR: - current_evmcs->guest_ss_selector = value; - break; - case GUEST_DS_SELECTOR: - current_evmcs->guest_ds_selector = value; - break; - 
case GUEST_FS_SELECTOR: - current_evmcs->guest_fs_selector = value; - break; - case GUEST_GS_SELECTOR: - current_evmcs->guest_gs_selector = value; - break; - case GUEST_LDTR_SELECTOR: - current_evmcs->guest_ldtr_selector = value; - break; - case GUEST_TR_SELECTOR: - current_evmcs->guest_tr_selector = value; - break; - case VIRTUAL_PROCESSOR_ID: - current_evmcs->virtual_processor_id = value; - break; - default: return 1; - } - - return 0; -} - -static inline int evmcs_vmlaunch(void) -{ - int ret; - - current_evmcs->hv_clean_fields = 0; - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "mov %%rsp, (%[host_rsp]);" - "lea 1f(%%rip), %%rax;" - "mov %%rax, (%[host_rip]);" - "vmlaunch;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r" - ((uint64_t)&current_evmcs->host_rsp), - [host_rip]"r" - ((uint64_t)&current_evmcs->host_rip) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -/* - * No guest state (e.g. GPRs) is established by this vmresume. - */ -static inline int evmcs_vmresume(void) -{ - int ret; - - current_evmcs->hv_clean_fields = 0; - - __asm__ __volatile__("push %%rbp;" - "push %%rcx;" - "push %%rdx;" - "push %%rsi;" - "push %%rdi;" - "push $0;" - "mov %%rsp, (%[host_rsp]);" - "lea 1f(%%rip), %%rax;" - "mov %%rax, (%[host_rip]);" - "vmresume;" - "incq (%%rsp);" - "1: pop %%rax;" - "pop %%rdi;" - "pop %%rsi;" - "pop %%rdx;" - "pop %%rcx;" - "pop %%rbp;" - : [ret]"=&a"(ret) - : [host_rsp]"r" - ((uint64_t)&current_evmcs->host_rsp), - [host_rip]"r" - ((uint64_t)&current_evmcs->host_rip) - : "memory", "cc", "rbx", "r8", "r9", "r10", - "r11", "r12", "r13", "r14", "r15"); - return ret; -} - -#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/guest_modes.h b/tools/testing/selftests/kvm/include/guest_modes.h new file mode 100644 index 000000000000..63f5167397cc --- /dev/null +++ b/tools/testing/selftests/kvm/include/guest_modes.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020, Red Hat, Inc. + */ +#include "kvm_util.h" + +struct guest_mode { + bool supported; + bool enabled; +}; + +extern struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_append(mode, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ (enabled), (enabled) }; \ +}) + +void guest_modes_append_default(void); +void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg); +void guest_modes_help(void); +void guest_modes_cmdline(const char *arg); diff --git a/tools/testing/selftests/kvm/include/kvm_test_harness.h b/tools/testing/selftests/kvm/include/kvm_test_harness.h new file mode 100644 index 000000000000..8f7c6858e8e2 --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_test_harness.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Macros for defining a KVM test + * + * Copyright (C) 2022, Google LLC.
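As an illustrative aside (hypothetical sketch, not part of this diff), the guest_modes helpers declared above are normally driven from a test's main(); pr_info() and vm_guest_mode_string() are assumed to come from the selftest headers:

static void run_test(enum vm_guest_mode mode, void *arg)
{
	/* A real test would create a VM in 'mode' and exercise it here. */
	pr_info("Running in guest mode %s\n", vm_guest_mode_string(mode));
}

int main(int argc, char *argv[])
{
	guest_modes_append_default();		/* mark host-supported modes as enabled */
	for_each_guest_mode(run_test, NULL);	/* invoke run_test() once per enabled mode */
	return 0;
}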
+ */ + +#ifndef SELFTEST_KVM_TEST_HARNESS_H +#define SELFTEST_KVM_TEST_HARNESS_H + +#include "kselftest_harness.h" + +#define KVM_ONE_VCPU_TEST_SUITE(name) \ + FIXTURE(name) { \ + struct kvm_vcpu *vcpu; \ + }; \ + \ + FIXTURE_SETUP(name) { \ + (void)vm_create_with_one_vcpu(&self->vcpu, NULL); \ + } \ + \ + FIXTURE_TEARDOWN(name) { \ + kvm_vm_free(self->vcpu->vm); \ + } + +#define KVM_ONE_VCPU_TEST(suite, test, guestcode) \ +static void __suite##_##test(struct kvm_vcpu *vcpu); \ + \ +TEST_F(suite, test) \ +{ \ + vcpu_arch_set_entry_point(self->vcpu, guestcode); \ + __suite##_##test(self->vcpu); \ +} \ +static void __suite##_##test(struct kvm_vcpu *vcpu) + +#endif /* SELFTEST_KVM_TEST_HARNESS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 919e161dd289..63c2aaae51f3 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * tools/testing/selftests/kvm/include/kvm_util.h - * * Copyright (C) 2018, Google LLC. */ #ifndef SELFTEST_KVM_UTIL_H @@ -9,257 +7,938 @@ #include "test_util.h" -#include "asm/kvm.h" +#include <linux/compiler.h> +#include "linux/hashtable.h" #include "linux/list.h" -#include "linux/kvm.h" +#include <linux/kernel.h> +#include <linux/kvm.h> +#include "linux/rbtree.h" +#include <linux/types.h> + +#include <asm/atomic.h> +#include <asm/kvm.h> + #include <sys/ioctl.h> +#include "kvm_util_arch.h" +#include "kvm_util_types.h" #include "sparsebit.h" +#define KVM_DEV_PATH "/dev/kvm" +#define KVM_MAX_VCPUS 512 -/* - * Callers of kvm_util only have an incomplete/opaque description of the - * structure kvm_util is using to maintain the state of a VM. - */ -struct kvm_vm; +#define NSEC_PER_SEC 1000000000L + +struct userspace_mem_region { + struct kvm_userspace_memory_region2 region; + struct sparsebit *unused_phy_pages; + struct sparsebit *protected_phy_pages; + int fd; + off_t offset; + enum vm_mem_backing_src_type backing_src_type; + void *host_mem; + void *host_alias; + void *mmap_start; + void *mmap_alias; + size_t mmap_size; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; +}; + +struct kvm_vcpu { + struct list_head list; + uint32_t id; + int fd; + struct kvm_vm *vm; + struct kvm_run *run; +#ifdef __x86_64__ + struct kvm_cpuid2 *cpuid; +#endif + struct kvm_dirty_gfn *dirty_gfns; + uint32_t fetch_index; + uint32_t dirty_gfns_count; +}; + +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + +enum kvm_mem_region_type { + MEM_REGION_CODE, + MEM_REGION_DATA, + MEM_REGION_PT, + MEM_REGION_TEST_DATA, + NR_MEM_REGIONS, +}; + +struct kvm_vm { + int mode; + unsigned long type; + int kvm_fd; + int fd; + unsigned int pgtable_levels; + unsigned int page_size; + unsigned int page_shift; + unsigned int pa_bits; + unsigned int va_bits; + uint64_t max_gfn; + struct list_head vcpus; + struct userspace_mem_regions regions; + struct sparsebit *vpages_valid; + struct sparsebit *vpages_mapped; + bool has_irqchip; + bool pgd_created; + vm_paddr_t ucall_mmio_addr; + vm_paddr_t pgd; + vm_vaddr_t handlers; + uint32_t dirty_ring_size; + uint64_t gpa_tag_mask; + + struct kvm_vm_arch arch; + + /* Cache of information for binary stats interface */ + int stats_fd; + struct kvm_stats_header stats_header; + struct kvm_stats_desc *stats_desc; + + /* + * KVM region slots. 
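To illustrate how the harness macros above are meant to be expanded in a test file, here is a hypothetical sketch (suite, test, and guest-code names are invented, not part of this diff); GUEST_DONE() and TEST_HARNESS_MAIN are assumed to come from the ucall and kselftest harness headers:

KVM_ONE_VCPU_TEST_SUITE(demo);

static void demo_guest_code(void)
{
	GUEST_DONE();	/* ucall back to the host and stop */
}

KVM_ONE_VCPU_TEST(demo, basic_run, demo_guest_code)
{
	/* 'vcpu' is provided by the fixture created by KVM_ONE_VCPU_TEST_SUITE(). */
	vcpu_run(vcpu);
}

TEST_HARNESS_MAIN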
These are the default memslots used by page + * allocators, e.g., lib/elf uses the memslots[MEM_REGION_CODE] + * memslot. + */ + uint32_t memslots[NR_MEM_REGIONS]; +}; + +struct vcpu_reg_sublist { + const char *name; + long capability; + int feature; + int feature_type; + bool finalize; + __u64 *regs; + __u64 regs_n; + __u64 *rejects_set; + __u64 rejects_set_n; + __u64 *skips_set; + __u64 skips_set_n; +}; + +struct vcpu_reg_list { + char *name; + struct vcpu_reg_sublist sublists[]; +}; + +#define for_each_sublist(c, s) \ + for ((s) = &(c)->sublists[0]; (s)->regs; ++(s)) -typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ -typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ +#define kvm_for_each_vcpu(vm, i, vcpu) \ + for ((i) = 0; (i) <= (vm)->last_vcpu_id; (i)++) \ + if (!((vcpu) = vm->vcpus[i])) \ + continue; \ + else + +struct userspace_mem_region * +memslot2region(struct kvm_vm *vm, uint32_t memslot); + +static inline struct userspace_mem_region *vm_get_mem_region(struct kvm_vm *vm, + enum kvm_mem_region_type type) +{ + assert(type < NR_MEM_REGIONS); + return memslot2region(vm, vm->memslots[type]); +} /* Minimum allocated guest virtual and physical addresses */ #define KVM_UTIL_MIN_VADDR 0x2000 +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 -#define DEFAULT_GUEST_PHY_PAGES 512 #define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000 #define DEFAULT_STACK_PGS 5 enum vm_guest_mode { VM_MODE_P52V48_4K, + VM_MODE_P52V48_16K, VM_MODE_P52V48_64K, VM_MODE_P48V48_4K, + VM_MODE_P48V48_16K, VM_MODE_P48V48_64K, VM_MODE_P40V48_4K, + VM_MODE_P40V48_16K, VM_MODE_P40V48_64K, VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_P47V64_4K, + VM_MODE_P44V64_4K, + VM_MODE_P36V48_4K, + VM_MODE_P36V48_16K, + VM_MODE_P36V48_64K, + VM_MODE_P36V47_16K, NUM_VM_MODES, }; +struct vm_shape { + uint32_t type; + uint8_t mode; + uint8_t pad0; + uint16_t pad1; +}; + +kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t)); + +#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + #if defined(__aarch64__) -#define VM_MODE_DEFAULT VM_MODE_P40V48_4K + +extern enum vm_guest_mode vm_mode_default; + +#define VM_MODE_DEFAULT vm_mode_default +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + #elif defined(__x86_64__) -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K -#else -#define VM_MODE_DEFAULT VM_MODE_P52V48_4K + +#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + +#elif defined(__s390x__) + +#define VM_MODE_DEFAULT VM_MODE_P44V64_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 16) + +#elif defined(__riscv) + +#if __riscv_xlen == 32 +#error "RISC-V 32-bit kvm selftests not supported" +#endif + +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#define MIN_PAGE_SHIFT 12U +#define ptes_per_page(page_size) ((page_size) / 8) + #endif -#define vm_guest_mode_string(m) vm_guest_mode_string[m] -extern const char * const vm_guest_mode_string[]; +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) -enum vm_mem_backing_src_type { - VM_MEM_SRC_ANONYMOUS, - VM_MEM_SRC_ANONYMOUS_THP, - VM_MEM_SRC_ANONYMOUS_HUGETLB, +#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) +#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) + +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; 
}; +extern const struct vm_guest_mode_params vm_guest_mode_params[]; + +int open_path_or_exit(const char *path, int flags); +int open_kvm_dev_path_or_exit(void); + +bool get_kvm_param_bool(const char *param); +bool get_kvm_intel_param_bool(const char *param); +bool get_kvm_amd_param_bool(const char *param); + +int get_kvm_param_integer(const char *param); +int get_kvm_intel_param_integer(const char *param); +int get_kvm_amd_param_integer(const char *param); + +unsigned int kvm_check_cap(long cap); + +static inline bool kvm_has_cap(long cap) +{ + return kvm_check_cap(cap); +} + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* + * Use the "inner", double-underscore macro when reporting errors from within + * other macros so that the name of ioctl() and not its literal numeric value + * is printed on error. The "outer" macro is strongly preferred when reporting + * errors "directly", i.e. without an additional layer of macros, as it reduces + * the probability of passing in the wrong string. + */ +#define __KVM_IOCTL_ERROR(_name, _ret) __KVM_SYSCALL_ERROR(_name, _ret) +#define KVM_IOCTL_ERROR(_ioctl, _ret) __KVM_IOCTL_ERROR(#_ioctl, _ret) + +#define kvm_do_ioctl(fd, cmd, arg) \ +({ \ + kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd)); \ + ioctl(fd, cmd, arg); \ +}) + +#define __kvm_ioctl(kvm_fd, cmd, arg) \ + kvm_do_ioctl(kvm_fd, cmd, arg) + +#define kvm_ioctl(kvm_fd, cmd, arg) \ +({ \ + int ret = __kvm_ioctl(kvm_fd, cmd, arg); \ + \ + TEST_ASSERT(!ret, __KVM_IOCTL_ERROR(#cmd, ret)); \ +}) + +static __always_inline void static_assert_is_vm(struct kvm_vm *vm) { } + +#define __vm_ioctl(vm, cmd, arg) \ +({ \ + static_assert_is_vm(vm); \ + kvm_do_ioctl((vm)->fd, cmd, arg); \ +}) + +/* + * Assert that a VM or vCPU ioctl() succeeded, with extra magic to detect if + * the ioctl() failed because KVM killed/bugged the VM. To detect a dead VM, + * probe KVM_CAP_USER_MEMORY, which (a) has been supported by KVM since before + * selftests existed and (b) should never outright fail, i.e. is supposed to + * return 0 or 1. If KVM kills a VM, KVM returns -EIO for all ioctl()s for the + * VM and its vCPUs, including KVM_CHECK_EXTENSION. + */ +#define __TEST_ASSERT_VM_VCPU_IOCTL(cond, name, ret, vm) \ +do { \ + int __errno = errno; \ + \ + static_assert_is_vm(vm); \ + \ + if (cond) \ + break; \ + \ + if (errno == EIO && \ + __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY) < 0) { \ + TEST_ASSERT(errno == EIO, "KVM killed the VM, should return -EIO"); \ + TEST_FAIL("KVM killed/bugged the VM, check the kernel log for clues"); \ + } \ + errno = __errno; \ + TEST_ASSERT(cond, __KVM_IOCTL_ERROR(name, ret)); \ +} while (0) + +#define TEST_ASSERT_VM_VCPU_IOCTL(cond, cmd, ret, vm) \ + __TEST_ASSERT_VM_VCPU_IOCTL(cond, #cmd, ret, vm) + +#define vm_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +static __always_inline void static_assert_is_vcpu(struct kvm_vcpu *vcpu) { } + +#define __vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + static_assert_is_vcpu(vcpu); \ + kvm_do_ioctl((vcpu)->fd, cmd, arg); \ +}) + +#define vcpu_ioctl(vcpu, cmd, arg) \ +({ \ + int ret = __vcpu_ioctl(vcpu, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, (vcpu)->vm); \ +}) + +/* + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. 
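For orientation, the difference between the asserting and the double-underscore ioctl wrappers above might look like the following hypothetical snippet (not part of this diff):

struct kvm_mp_state mp_state;
int ret;

/* Asserting form: a failure kills the test with a decoded ioctl name and errno. */
vcpu_ioctl(vcpu, KVM_GET_MP_STATE, &mp_state);

/* "__" form: the return value is handed back so the caller can check it. */
ret = __vcpu_ioctl(vcpu, KVM_GET_MP_STATE, &mp_state);
TEST_ASSERT(!ret, "KVM_GET_MP_STATE failed with errno %d", errno);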
+ */ +static inline int vm_check_cap(struct kvm_vm *vm, long cap) +{ + int ret = __vm_ioctl(vm, KVM_CHECK_EXTENSION, (void *)cap); + + TEST_ASSERT_VM_VCPU_IOCTL(ret >= 0, KVM_CHECK_EXTENSION, ret, vm); + return ret; +} -int kvm_check_cap(long cap); -int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); +static inline int __vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + return __vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} +static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. + */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); +const char *vm_guest_mode_string(uint32_t i); -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); -void kvm_vm_restart(struct kvm_vm *vmp, int perm); +void kvm_vm_restart(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp); -void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log); -void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages); - int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva, size_t len); - -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot); +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename); +int kvm_memfd_alloc(size_t size, bool hugepages); void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -/* - * VM VCPU Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the current state of the VCPU specified by @vcpuid, within the VM - * given by @vm, to the FILE stream given by @stream. 
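As a hedged usage sketch (not from this diff), a private-memory test might flip a guest-physical range between private and shared with the attribute and fallocate helpers above:

static void toggle_private(struct kvm_vm *vm, uint64_t gpa, uint64_t size)
{
	/* Mark the range private; the guest's accesses now use the guest_memfd backing. */
	vm_mem_set_private(vm, gpa, size);

	/* Flip it back to shared, then release the private backing pages. */
	vm_mem_set_shared(vm, gpa, size);
	vm_guest_mem_punch_hole(vm, gpa, size);
}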
- */ -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, - uint8_t indent); +static inline void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +{ + struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; + + vm_ioctl(vm, KVM_GET_DIRTY_LOG, &args); +} + +static inline void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, + uint64_t first_page, uint32_t num_pages) +{ + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, + .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; + + vm_ioctl(vm, KVM_CLEAR_DIRTY_LOG, &args); +} + +static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) +{ + return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL); +} + +static inline int vm_get_stats_fd(struct kvm_vm *vm) +{ + int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL); + + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_GET_STATS_FD, fd, vm); + return fd; +} + +static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header) +{ + ssize_t ret; + + ret = pread(stats_fd, header, sizeof(*header), 0); + TEST_ASSERT(ret == sizeof(*header), + "Failed to read '%lu' header bytes, ret = '%ld'", + sizeof(*header), ret); +} + +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header); + +static inline ssize_t get_stats_descriptor_size(struct kvm_stats_header *header) +{ + /* + * The base size of the descriptor is defined by KVM's ABI, but the + * size of the name field is variable, as far as KVM's ABI is + * concerned. For a given instance of KVM, the name field is the same + * size for all stats and is provided in the overall stats header. + */ + return sizeof(struct kvm_stats_desc) + header->name_size; +} + +static inline struct kvm_stats_desc *get_stats_descriptor(struct kvm_stats_desc *stats, + int index, + struct kvm_stats_header *header) +{ + /* + * Note, size_desc includes the size of the name field, which is + * variable. i.e. this is NOT equivalent to &stats_desc[i]. 
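A hypothetical sketch (not part of this diff) of walking the binary stats descriptors with the helpers above; pr_info() is assumed from the selftest headers:

int stats_fd = vm_get_stats_fd(vm);
struct kvm_stats_header header;
struct kvm_stats_desc *descs;
int i;

read_stats_header(stats_fd, &header);
descs = read_stats_descriptors(stats_fd, &header);

/* Each descriptor is header.name_size bytes larger than the base struct,
 * so index with the helper instead of &descs[i]. */
for (i = 0; i < header.num_desc; i++)
	pr_info("stat: %s\n", get_stats_descriptor(descs, i, &header)->name);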
+ */ + return (void *)stats + index * get_stats_descriptor_size(header); +} + +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements); + +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements); + +static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) +{ + uint64_t data; + + __vm_get_stat(vm, stat_name, &data, 1); + return data; +} void vm_create_irqchip(struct kvm_vm *vm); +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + +#ifndef vm_arch_has_protected_memory +static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) +{ + return false; +} +#endif -void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - void *arg); -int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, - void *arg); -void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid); -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot); +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vm_populate_vaddr_bitmap(struct kvm_vm *vm); +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min); +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages); +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, + enum kvm_mem_region_type type); +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm); + void virt_map(struct kvm_vm *vm, uint64_t vaddr, 
uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot); + unsigned int npages); void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gva - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Returns the VM physical address of the translated VM virtual - * address given by @gva. - */ -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); - -struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); -int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_guest_debug *debug); -void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state); -void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); -void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs); +#ifndef vcpu_arch_put_guest +#define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0) +#endif + +static inline vm_paddr_t vm_untag_gpa(struct kvm_vm *vm, vm_paddr_t gpa) +{ + return gpa & ~vm->gpa_tag_mask; +} + +void vcpu_run(struct kvm_vcpu *vcpu); +int _vcpu_run(struct kvm_vcpu *vcpu); + +static inline int __vcpu_run(struct kvm_vcpu *vcpu) +{ + return __vcpu_ioctl(vcpu, KVM_RUN, NULL); +} + +void vcpu_run_complete_io(struct kvm_vcpu *vcpu); +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu); + +static inline void vcpu_enable_cap(struct kvm_vcpu *vcpu, uint32_t cap, + uint64_t arg0) +{ + struct kvm_enable_cap enable_cap = { .cap = cap, .args = { arg0 } }; + + vcpu_ioctl(vcpu, KVM_ENABLE_CAP, &enable_cap); +} + +static inline void vcpu_guest_debug_set(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *debug) +{ + vcpu_ioctl(vcpu, KVM_SET_GUEST_DEBUG, debug); +} + +static inline void vcpu_mp_state_get(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_GET_MP_STATE, mp_state); +} +static inline void vcpu_mp_state_set(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_ioctl(vcpu, KVM_SET_MP_STATE, mp_state); +} + +static inline void vcpu_regs_get(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_GET_REGS, regs); +} + +static inline void vcpu_regs_set(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + vcpu_ioctl(vcpu, KVM_SET_REGS, regs); +} +static inline void vcpu_sregs_get(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_GET_SREGS, sregs); + +} +static inline void vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline int _vcpu_sregs_set(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return __vcpu_ioctl(vcpu, KVM_SET_SREGS, sregs); +} +static inline void vcpu_fpu_get(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_GET_FPU, fpu); +} +static inline void vcpu_fpu_set(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + vcpu_ioctl(vcpu, KVM_SET_FPU, fpu); +} + +static inline int __vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + return __vcpu_ioctl(vcpu, KVM_GET_ONE_REG, 
&reg); +} +static inline int __vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + return __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); +} +static inline void vcpu_get_reg(struct kvm_vcpu *vcpu, uint64_t id, void *addr) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)addr }; + + vcpu_ioctl(vcpu, KVM_GET_ONE_REG, &reg); +} +static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + struct kvm_one_reg reg = { .id = id, .addr = (uint64_t)&val }; + + vcpu_ioctl(vcpu, KVM_SET_ONE_REG, &reg); +} -/* - * VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first @num function input registers of the VCPU with @vcpuid, - * per the C calling convention of the architecture, to the values given - * as variable args. Each of the variable args is expected to be of type - * uint64_t. The maximum @num can be is specific to the architecture. - */ -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...); - -void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_sregs *sregs); -void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu); -void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_fpu *fpu); -void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); -void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); #ifdef __KVM_HAVE_VCPU_EVENTS -void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); -void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events); +static inline void vcpu_events_get(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_GET_VCPU_EVENTS, events); +} +static inline void vcpu_events_set(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_ioctl(vcpu, KVM_SET_VCPU_EVENTS, events); +} #endif #ifdef __x86_64__ -void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state); -int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state, bool ignore_error); +static inline void vcpu_nested_state_get(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_GET_NESTED_STATE, state); +} +static inline int __vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + return __vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} + +static inline void vcpu_nested_state_set(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + vcpu_ioctl(vcpu, KVM_SET_NESTED_STATE, state); +} #endif +static inline int vcpu_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd = __vcpu_ioctl(vcpu, KVM_GET_STATS_FD, NULL); -const char *exit_reason_str(unsigned int exit_reason); + TEST_ASSERT_VM_VCPU_IOCTL(fd >= 0, KVM_CHECK_EXTENSION, fd, vcpu->vm); + return fd; +} + +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr); + +static inline void kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + int ret = __kvm_has_device_attr(dev_fd, group, attr); -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot); +
TEST_ASSERT(!ret, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno); +} + +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_get(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_get(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_GET_DEVICE_ATTR, ret)); +} + +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val); + +static inline void kvm_device_attr_set(int dev_fd, uint32_t group, + uint64_t attr, void *val) +{ + int ret = __kvm_device_attr_set(dev_fd, group, attr, val); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_DEVICE_ATTR, ret)); +} + +static inline int __vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + return __kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline void vcpu_has_device_attr(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr) +{ + kvm_has_device_attr(vcpu->fd, group, attr); +} + +static inline int __vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_get(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_get(vcpu->fd, group, attr, val); +} + +static inline int __vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + return __kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +static inline void vcpu_device_attr_set(struct kvm_vcpu *vcpu, uint32_t group, + uint64_t attr, void *val) +{ + kvm_device_attr_set(vcpu->fd, group, attr, val); +} + +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type); +int __kvm_create_device(struct kvm_vm *vm, uint64_t type); + +static inline int kvm_create_device(struct kvm_vm *vm, uint64_t type) +{ + int fd = __kvm_create_device(vm, type); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_DEVICE, fd)); + return fd; +} + +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu); /* - * VM Virtual Page Map + * VM VCPU Args Set * * Input Args: * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * memslot - Memory region slot for new virtual translation tables + * num - number of arguments + * ... - arguments, each of type uint64_t * * Output Args: None * * Return: None * - * Within @vm, creates a virtual translation for the page starting - * at @vaddr to the page starting at @paddr. + * Sets the first @num input parameters for the function at @vcpu's entry point, + * per the C calling convention of the architecture, to the values given as + * variable args. Each of the variable args is expected to be of type uint64_t. + * The maximum @num can be is specific to the architecture. 
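To make the calling-convention comment above concrete, a hypothetical test (names invented, not part of this diff) might pass guest arguments like this; GUEST_ASSERT()/GUEST_DONE() are assumed from the ucall header:

static void guest_code(uint64_t a, uint64_t b)
{
	GUEST_ASSERT(a + b == 42);
	GUEST_DONE();
}

/* In the host-side test body: load the first two argument registers, then run. */
vcpu_args_set(vcpu, 2, 40, 2);
vcpu_run(vcpu);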
*/ -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t memslot); +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...); + +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level); + +#define KVM_MAX_IRQ_ROUTES 4096 + +struct kvm_irq_routing *kvm_gsi_routing_create(void); +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin); +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing); + +const char *exit_reason_str(unsigned int exit_reason); vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, uint32_t memslot); -vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot); +vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot, + bool protected); +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); + +static inline vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot) +{ + /* + * By default, allocate memory as protected for VMs that support + * protected memory, as the majority of memory for such VMs is + * protected, i.e. using shared memory is effectively opt-in. + */ + return __vm_phy_pages_alloc(vm, num, paddr_min, memslot, + vm_arch_has_protected_memory(vm)); +} /* - * Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * extra_mem_pages - The number of extra pages to add (this will - * decide how much extra space we will need to - * setup the page tables using memslot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. + * ____vm_create() does KVM_CREATE_VM and little else. __vm_create() also + * loads the test binary into guest memory and creates an IRQ chip (x86 only). + * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to + * calculate the amount of memory needed for per-vCPU data, e.g. stacks. */ -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code); +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages); + +static inline struct kvm_vm *vm_create_barebones(void) +{ + return ____vm_create(VM_SHAPE_DEFAULT); +} + +static inline struct kvm_vm *vm_create_barebones_type(unsigned long type) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + + return ____vm_create(shape); +} + +static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) +{ + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); +} + +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]); + +static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, + void *guest_code, + struct kvm_vcpu *vcpus[]) +{ + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, + guest_code, vcpus); +} + + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); /* - * Adds a vCPU with reasonable defaults (e.g. 
a stack) - * - * Input Args: - * vm - Virtual Machine - * vcpuid - The id of the VCPU to add to the VM. - * guest_code - The vCPU's entry point + * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages + * additional pages of guest memory. Returns the VM and vCPU (via out param). */ -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} + +static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_with_one_vcpu(vcpu, 0, guest_code); +} -bool vm_is_unrestricted_guest(struct kvm_vm *vm); +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} -unsigned int vm_get_page_size(struct kvm_vm *vm); -unsigned int vm_get_page_shift(struct kvm_vm *vm); -unsigned int vm_get_max_gfn(struct kvm_vm *vm); -int vm_get_fd(struct kvm_vm *vm); +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); +void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_print_vcpu_pinning_help(void); +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus); + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages); unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages); @@ -275,15 +954,6 @@ vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) return n; } -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end); - -struct kvm_dirty_log * -allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region); - -int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); - #define sync_global_to_guest(vm, g) ({ \ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ memcpy(_p, &(g), sizeof(g)); \ @@ -294,50 +964,153 @@ int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd); memcpy(&(g), _p, sizeof(g)); \ }) -/* Common ucalls */ -enum { - UCALL_NONE, - UCALL_SYNC, - UCALL_ABORT, - UCALL_DONE, -}; +/* + * Write a global value, but only in the VM's (guest's) domain. Primarily used + * for "globals" that hold per-VM values (VMs always duplicate code and global + * data into their own region of physical memory), but can be used anytime it's + * undesirable to change the host's copy of the global. 
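As an illustrative sketch (not part of this diff), the sync_global_*() macros above are typically used to share a host-initialized global with the guest code that the VM duplicates into its own memory:

static uint64_t nr_iterations;	/* global visible to both host and guest code */

/* Host: pick a value, then copy it into the guest's instance of the global. */
nr_iterations = 100;
sync_global_to_guest(vm, nr_iterations);

/* Later, pull the guest's (possibly updated) copy back into the host. */
sync_global_from_guest(vm, nr_iterations);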
+ */ +#define write_guest_global(vm, g, val) ({ \ + typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ + typeof(g) _val = val; \ + \ + memcpy(_p, &(_val), sizeof(g)); \ +}) -#define UCALL_MAX_ARGS 6 +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu); -struct ucall { - uint64_t cmd; - uint64_t args[UCALL_MAX_ARGS]; -}; +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent); -void ucall_init(struct kvm_vm *vm, void *arg); -void ucall_uninit(struct kvm_vm *vm); -void ucall(uint64_t cmd, int nargs, ...); -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc); - -#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ - ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) -#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) -#define GUEST_DONE() ucall(UCALL_DONE, 0) -#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \ - if (!(_condition)) \ - ucall(UCALL_ABORT, 2 + _nargs, \ - "Failed guest assert: " \ - #_condition, __LINE__, _args); \ -} while (0) +static inline void vcpu_dump(FILE *stream, struct kvm_vcpu *vcpu, + uint8_t indent) +{ + vcpu_arch_dump(stream, vcpu, indent); +} + +/* + * Adds a vCPU with reasonable defaults (e.g. a stack) + * + * Input Args: + * vm - Virtual Machine + * vcpu_id - The id of the VCPU to add to the VM. + */ +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code); + +static inline struct kvm_vcpu *vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_arch_vcpu_add(vm, vcpu_id); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; +} + +/* Re-create a vCPU after restarting a VM, e.g. for state save/restore tests. */ +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id); -#define GUEST_ASSERT(_condition) \ - __GUEST_ASSERT((_condition), 0, 0) +static inline struct kvm_vcpu *vm_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) +{ + return vm_arch_vcpu_recreate(vm, vcpu_id); +} + +void vcpu_arch_free(struct kvm_vcpu *vcpu); + +void virt_arch_pgd_alloc(struct kvm_vm *vm); + +static inline void virt_pgd_alloc(struct kvm_vm *vm) +{ + virt_arch_pgd_alloc(vm); +} -#define GUEST_ASSERT_1(_condition, arg1) \ - __GUEST_ASSERT((_condition), 1, (arg1)) +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * vaddr - VM Virtual Address + * paddr - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within @vm, creates a virtual translation for the page starting + * at @vaddr to the page starting at @paddr. + */ +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); + +static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + virt_arch_pg_map(vm, vaddr, paddr); +} + + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Returns the VM physical address of the translated VM virtual + * address given by @gva. 
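A hedged sketch (not from this diff) tying the address-translation helpers together; vm_vaddr_alloc_page() and addr_gpa2hva() are assumed from the declarations earlier in this header:

vm_vaddr_t gva = vm_vaddr_alloc_page(vm);	/* guest-virtual page */
vm_paddr_t gpa = addr_gva2gpa(vm, gva);		/* its guest-physical address */
uint8_t *hva = addr_gpa2hva(vm, gpa);		/* host mapping of that page */

/* The host can now initialize the page the guest will see at 'gva'. */
memset(hva, 0xaa, vm->page_size);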
+ */ +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva); + +static inline vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + return addr_arch_gva2gpa(vm, gva); +} + +/* + * Virtual Translation Tables Dump + * + * Input Args: + * stream - Output FILE stream + * vm - Virtual Machine + * indent - Left margin indent amount + * + * Output Args: None + * + * Return: None + * + * Dumps to the FILE stream given by @stream, the contents of all the + * virtual translation tables for the VM given by @vm. + */ +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); + +static inline void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + virt_arch_dump(stream, vm, indent); +} + + +static inline int __vm_disable_nx_huge_pages(struct kvm_vm *vm) +{ + return __vm_enable_cap(vm, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, 0); +} + +/* + * Arch hook that is invoked via a constructor, i.e. before exeucting main(), + * to allow for arch-specific setup that is common to all tests, e.g. computing + * the default guest "mode". + */ +void kvm_selftest_arch_init(void); -#define GUEST_ASSERT_2(_condition, arg1, arg2) \ - __GUEST_ASSERT((_condition), 2, (arg1), (arg2)) +void kvm_arch_vm_post_create(struct kvm_vm *vm); -#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \ - __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3)) +bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr); -#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \ - __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4)) +uint32_t guest_get_vcpuid(void); #endif /* SELFTEST_KVM_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util_types.h b/tools/testing/selftests/kvm/include/kvm_util_types.h new file mode 100644 index 000000000000..ec787b97cf18 --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_util_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_TYPES_H +#define SELFTEST_KVM_UTIL_TYPES_H + +/* + * Provide a version of static_assert() that is guaranteed to have an optional + * message param. _GNU_SOURCE is defined for all KVM selftests, _GNU_SOURCE + * implies _ISOC11_SOURCE, and if _ISOC11_SOURCE is defined, glibc #undefs and + * #defines static_assert() as a direct alias to _Static_assert() (see + * usr/include/assert.h). Define a custom macro instead of redefining + * static_assert() to avoid creating non-deterministic behavior that is + * dependent on include order. + */ +#define __kvm_static_assert(expr, msg, ...) _Static_assert(expr, msg) +#define kvm_static_assert(expr, ...) __kvm_static_assert(expr, ##__VA_ARGS__, #expr) + +typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ +typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ + +#endif /* SELFTEST_KVM_UTIL_TYPES_H */ diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h new file mode 100644 index 000000000000..9071eb6dea60 --- /dev/null +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tools/testing/selftests/kvm/include/memstress.h + * + * Copyright (C) 2020, Google LLC. 
+ */ + +#ifndef SELFTEST_KVM_MEMSTRESS_H +#define SELFTEST_KVM_MEMSTRESS_H + +#include <pthread.h> + +#include "kvm_util.h" + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ + +#define MEMSTRESS_MEM_SLOT_INDEX 1 + +struct memstress_vcpu_args { + uint64_t gpa; + uint64_t gva; + uint64_t pages; + + /* Only used by the host userspace part of the vCPU thread */ + struct kvm_vcpu *vcpu; + int vcpu_idx; +}; + +struct memstress_args { + struct kvm_vm *vm; + /* The starting address and size of the guest test region. */ + uint64_t gpa; + uint64_t size; + uint64_t guest_page_size; + uint32_t random_seed; + uint32_t write_percent; + + /* Run vCPUs in L2 instead of L1, if the architecture supports it. */ + bool nested; + /* Randomize which pages are accessed by the guest. */ + bool random_access; + /* True if all vCPUs are pinned to pCPUs */ + bool pin_vcpus; + /* The vCPU=>pCPU pinning map. Only valid if pin_vcpus is true. */ + uint32_t vcpu_to_pcpu[KVM_MAX_VCPUS]; + + /* Test is done, stop running vCPUs. */ + bool stop_vcpus; + + struct memstress_vcpu_args vcpu_args[KVM_MAX_VCPUS]; +}; + +extern struct memstress_args memstress_args; + +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, + uint64_t vcpu_memory_bytes, int slots, + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access); +void memstress_destroy_vm(struct kvm_vm *vm); + +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent); +void memstress_set_random_access(struct kvm_vm *vm, bool random_access); + +void memstress_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct memstress_vcpu_args *)); +void memstress_join_vcpu_threads(int vcpus); +void memstress_guest_code(uint32_t vcpu_id); + +uint64_t memstress_nested_pages(int nr_vcpus); +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); + +void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots); +void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots); +void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots); +void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot); +unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot); +void memstress_free_bitmaps(unsigned long *bitmaps[], int slots); + +#endif /* SELFTEST_KVM_MEMSTRESS_H */ diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h new file mode 100644 index 000000000000..b020547403fd --- /dev/null +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/numaif.h + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Header file that provides access to NUMA API functions not explicitly + * exported to user space. 
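For orientation, the memstress API above is normally driven along the following lines (hypothetical sketch, not part of this diff; nr_vcpus is assumed to be defined by the test):

static void vcpu_worker(struct memstress_vcpu_args *args)
{
	/* Keep re-entering the vCPU until the harness asks the workers to stop. */
	while (!READ_ONCE(memstress_args.stop_vcpus))
		vcpu_run(args->vcpu);
}

struct kvm_vm *vm = memstress_create_vm(VM_MODE_DEFAULT, nr_vcpus,
					 DEFAULT_PER_VCPU_MEM_SIZE,
					 1 /* slots */, VM_MEM_SRC_ANONYMOUS,
					 true /* partition memory per vCPU */);

memstress_start_vcpu_threads(nr_vcpus, vcpu_worker);
memstress_join_vcpu_threads(nr_vcpus);	/* signals the workers to stop, then joins them */
memstress_destroy_vm(vm);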
+ */ + +#ifndef SELFTEST_KVM_NUMAIF_H +#define SELFTEST_KVM_NUMAIF_H + +#define __NR_get_mempolicy 239 +#define __NR_migrate_pages 256 + +/* System calls */ +long get_mempolicy(int *policy, const unsigned long *nmask, + unsigned long maxnode, void *addr, int flags) +{ + return syscall(__NR_get_mempolicy, policy, nmask, + maxnode, addr, flags); +} + +long migrate_pages(int pid, unsigned long maxnode, + const unsigned long *frommask, + const unsigned long *tomask) +{ + return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); +} + +/* Policies */ +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 + +#define MPOL_MAX MPOL_INTERLEAVE + +/* Flags for get_mem_policy */ +#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ + /* Warning: MPOL_F_NODE is unsupported and + * subject to change. Don't use. + */ +#define MPOL_F_ADDR (1<<1) /* look up vma using address */ +#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ + +/* Flags for mbind */ +#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ + +#endif /* SELFTEST_KVM_NUMAIF_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/arch_timer.h b/tools/testing/selftests/kvm/include/riscv/arch_timer.h new file mode 100644 index 000000000000..225d81dad064 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/arch_timer.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V Arch Timer(sstc) specific interface + * + * Copyright (c) 2024 Intel Corporation + */ + +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include <asm/csr.h> +#include <asm/vdso/processor.h> + +static unsigned long timer_freq; + +#define msec_to_cycles(msec) \ + ((timer_freq) * (uint64_t)(msec) / 1000) + +#define usec_to_cycles(usec) \ + ((timer_freq) * (uint64_t)(usec) / 1000000) + +#define cycles_to_usec(cycles) \ + ((uint64_t)(cycles) * 1000000 / (timer_freq)) + +static inline uint64_t timer_get_cycles(void) +{ + return csr_read(CSR_TIME); +} + +static inline void timer_set_cmp(uint64_t cval) +{ + csr_write(CSR_STIMECMP, cval); +} + +static inline uint64_t timer_get_cmp(void) +{ + return csr_read(CSR_STIMECMP); +} + +static inline void timer_irq_enable(void) +{ + csr_set(CSR_SIE, IE_TIE); +} + +static inline void timer_irq_disable(void) +{ + csr_clear(CSR_SIE, IE_TIE); +} + +static inline void timer_set_next_cmp_ms(uint32_t msec) +{ + uint64_t now_ct = timer_get_cycles(); + uint64_t next_ct = now_ct + msec_to_cycles(msec); + + timer_set_cmp(next_ct); +} + +static inline void __delay(uint64_t cycles) +{ + uint64_t start = timer_get_cycles(); + + while ((timer_get_cycles() - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} + +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h new file mode 100644 index 000000000000..e43a57d99b56 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/kvm_util_arch.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +struct kvm_vm_arch {}; + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/riscv/processor.h 
b/tools/testing/selftests/kvm/include/riscv/processor.h new file mode 100644 index 000000000000..5f389166338c --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/processor.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V processor specific defines + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +#include <linux/stringify.h> +#include <asm/csr.h> +#include "kvm_util.h" + +static inline uint64_t __kvm_reg_id(uint64_t type, uint64_t subtype, + uint64_t idx, uint64_t size) +{ + return KVM_REG_RISCV | type | subtype | idx | size; +} + +#if __riscv_xlen == 64 +#define KVM_REG_SIZE_ULONG KVM_REG_SIZE_U64 +#else +#define KVM_REG_SIZE_ULONG KVM_REG_SIZE_U32 +#endif + +#define RISCV_CONFIG_REG(name) __kvm_reg_id(KVM_REG_RISCV_CONFIG, 0, \ + KVM_REG_RISCV_CONFIG_REG(name), \ + KVM_REG_SIZE_ULONG) + +#define RISCV_CORE_REG(name) __kvm_reg_id(KVM_REG_RISCV_CORE, 0, \ + KVM_REG_RISCV_CORE_REG(name), \ + KVM_REG_SIZE_ULONG) + +#define RISCV_GENERAL_CSR_REG(name) __kvm_reg_id(KVM_REG_RISCV_CSR, \ + KVM_REG_RISCV_CSR_GENERAL, \ + KVM_REG_RISCV_CSR_REG(name), \ + KVM_REG_SIZE_ULONG) + +#define RISCV_TIMER_REG(name) __kvm_reg_id(KVM_REG_RISCV_TIMER, 0, \ + KVM_REG_RISCV_TIMER_REG(name), \ + KVM_REG_SIZE_U64) + +#define RISCV_ISA_EXT_REG(idx) __kvm_reg_id(KVM_REG_RISCV_ISA_EXT, \ + KVM_REG_RISCV_ISA_SINGLE, \ + idx, KVM_REG_SIZE_ULONG) + +#define RISCV_SBI_EXT_REG(idx) __kvm_reg_id(KVM_REG_RISCV_SBI_EXT, \ + KVM_REG_RISCV_SBI_SINGLE, \ + idx, KVM_REG_SIZE_ULONG) + +bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext); + +static inline bool __vcpu_has_isa_ext(struct kvm_vcpu *vcpu, uint64_t isa_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_ISA_EXT_REG(isa_ext)); +} + +static inline bool __vcpu_has_sbi_ext(struct kvm_vcpu *vcpu, uint64_t sbi_ext) +{ + return __vcpu_has_ext(vcpu, RISCV_SBI_EXT_REG(sbi_ext)); +} + +struct ex_regs { + unsigned long ra; + unsigned long sp; + unsigned long gp; + unsigned long tp; + unsigned long t0; + unsigned long t1; + unsigned long t2; + unsigned long s0; + unsigned long s1; + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; + unsigned long a4; + unsigned long a5; + unsigned long a6; + unsigned long a7; + unsigned long s2; + unsigned long s3; + unsigned long s4; + unsigned long s5; + unsigned long s6; + unsigned long s7; + unsigned long s8; + unsigned long s9; + unsigned long s10; + unsigned long s11; + unsigned long t3; + unsigned long t4; + unsigned long t5; + unsigned long t6; + unsigned long epc; + unsigned long status; + unsigned long cause; +}; + +#define NR_VECTORS 2 +#define NR_EXCEPTIONS 32 +#define EC_MASK (NR_EXCEPTIONS - 1) + +typedef void(*exception_handler_fn)(struct ex_regs *); + +void vm_init_vector_tables(struct kvm_vm *vm); +void vcpu_init_vector_tables(struct kvm_vcpu *vcpu); + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler); + +void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler); + +/* L3 index Bit[47:39] */ +#define PGTBL_L3_INDEX_MASK 0x0000FF8000000000ULL +#define PGTBL_L3_INDEX_SHIFT 39 +#define PGTBL_L3_BLOCK_SHIFT 39 +#define PGTBL_L3_BLOCK_SIZE 0x0000008000000000ULL +#define PGTBL_L3_MAP_MASK (~(PGTBL_L3_BLOCK_SIZE - 1)) +/* L2 index Bit[38:30] */ +#define PGTBL_L2_INDEX_MASK 0x0000007FC0000000ULL +#define PGTBL_L2_INDEX_SHIFT 30 +#define PGTBL_L2_BLOCK_SHIFT 30 +#define PGTBL_L2_BLOCK_SIZE 
0x0000000040000000ULL +#define PGTBL_L2_MAP_MASK (~(PGTBL_L2_BLOCK_SIZE - 1)) +/* L1 index Bit[29:21] */ +#define PGTBL_L1_INDEX_MASK 0x000000003FE00000ULL +#define PGTBL_L1_INDEX_SHIFT 21 +#define PGTBL_L1_BLOCK_SHIFT 21 +#define PGTBL_L1_BLOCK_SIZE 0x0000000000200000ULL +#define PGTBL_L1_MAP_MASK (~(PGTBL_L1_BLOCK_SIZE - 1)) +/* L0 index Bit[20:12] */ +#define PGTBL_L0_INDEX_MASK 0x00000000001FF000ULL +#define PGTBL_L0_INDEX_SHIFT 12 +#define PGTBL_L0_BLOCK_SHIFT 12 +#define PGTBL_L0_BLOCK_SIZE 0x0000000000001000ULL +#define PGTBL_L0_MAP_MASK (~(PGTBL_L0_BLOCK_SIZE - 1)) + +#define PGTBL_PTE_ADDR_MASK 0x003FFFFFFFFFFC00ULL +#define PGTBL_PTE_ADDR_SHIFT 10 +#define PGTBL_PTE_RSW_MASK 0x0000000000000300ULL +#define PGTBL_PTE_RSW_SHIFT 8 +#define PGTBL_PTE_DIRTY_MASK 0x0000000000000080ULL +#define PGTBL_PTE_DIRTY_SHIFT 7 +#define PGTBL_PTE_ACCESSED_MASK 0x0000000000000040ULL +#define PGTBL_PTE_ACCESSED_SHIFT 6 +#define PGTBL_PTE_GLOBAL_MASK 0x0000000000000020ULL +#define PGTBL_PTE_GLOBAL_SHIFT 5 +#define PGTBL_PTE_USER_MASK 0x0000000000000010ULL +#define PGTBL_PTE_USER_SHIFT 4 +#define PGTBL_PTE_EXECUTE_MASK 0x0000000000000008ULL +#define PGTBL_PTE_EXECUTE_SHIFT 3 +#define PGTBL_PTE_WRITE_MASK 0x0000000000000004ULL +#define PGTBL_PTE_WRITE_SHIFT 2 +#define PGTBL_PTE_READ_MASK 0x0000000000000002ULL +#define PGTBL_PTE_READ_SHIFT 1 +#define PGTBL_PTE_PERM_MASK (PGTBL_PTE_ACCESSED_MASK | \ + PGTBL_PTE_DIRTY_MASK | \ + PGTBL_PTE_EXECUTE_MASK | \ + PGTBL_PTE_WRITE_MASK | \ + PGTBL_PTE_READ_MASK) +#define PGTBL_PTE_VALID_MASK 0x0000000000000001ULL +#define PGTBL_PTE_VALID_SHIFT 0 + +#define PGTBL_PAGE_SIZE PGTBL_L0_BLOCK_SIZE +#define PGTBL_PAGE_SIZE_SHIFT PGTBL_L0_BLOCK_SHIFT + +static inline void local_irq_enable(void) +{ + csr_set(CSR_SSTATUS, SR_SIE); +} + +static inline void local_irq_disable(void) +{ + csr_clear(CSR_SSTATUS, SR_SIE); +} + +#endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h new file mode 100644 index 000000000000..046b432ae896 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * RISC-V SBI specific definitions + * + * Copyright (C) 2024 Rivos Inc. 
+ */ + +#ifndef SELFTEST_KVM_SBI_H +#define SELFTEST_KVM_SBI_H + +/* SBI spec version fields */ +#define SBI_SPEC_VERSION_DEFAULT 0x1 +#define SBI_SPEC_VERSION_MAJOR_SHIFT 24 +#define SBI_SPEC_VERSION_MAJOR_MASK 0x7f +#define SBI_SPEC_VERSION_MINOR_MASK 0xffffff + +/* SBI return error codes */ +#define SBI_SUCCESS 0 +#define SBI_ERR_FAILURE -1 +#define SBI_ERR_NOT_SUPPORTED -2 +#define SBI_ERR_INVALID_PARAM -3 +#define SBI_ERR_DENIED -4 +#define SBI_ERR_INVALID_ADDRESS -5 +#define SBI_ERR_ALREADY_AVAILABLE -6 +#define SBI_ERR_ALREADY_STARTED -7 +#define SBI_ERR_ALREADY_STOPPED -8 + +#define SBI_EXT_EXPERIMENTAL_START 0x08000000 +#define SBI_EXT_EXPERIMENTAL_END 0x08FFFFFF + +#define KVM_RISCV_SELFTESTS_SBI_EXT SBI_EXT_EXPERIMENTAL_END +#define KVM_RISCV_SELFTESTS_SBI_UCALL 0 +#define KVM_RISCV_SELFTESTS_SBI_UNEXP 1 + +enum sbi_ext_id { + SBI_EXT_BASE = 0x10, + SBI_EXT_STA = 0x535441, + SBI_EXT_PMU = 0x504D55, +}; + +enum sbi_ext_base_fid { + SBI_EXT_BASE_GET_SPEC_VERSION = 0, + SBI_EXT_BASE_GET_IMP_ID, + SBI_EXT_BASE_GET_IMP_VERSION, + SBI_EXT_BASE_PROBE_EXT = 3, +}; +enum sbi_ext_pmu_fid { + SBI_EXT_PMU_NUM_COUNTERS = 0, + SBI_EXT_PMU_COUNTER_GET_INFO, + SBI_EXT_PMU_COUNTER_CFG_MATCH, + SBI_EXT_PMU_COUNTER_START, + SBI_EXT_PMU_COUNTER_STOP, + SBI_EXT_PMU_COUNTER_FW_READ, + SBI_EXT_PMU_COUNTER_FW_READ_HI, + SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, +}; + +union sbi_pmu_ctr_info { + unsigned long value; + struct { + unsigned long csr:12; + unsigned long width:6; +#if __riscv_xlen == 32 + unsigned long reserved:13; +#else + unsigned long reserved:45; +#endif + unsigned long type:1; + }; +}; + +struct riscv_pmu_snapshot_data { + u64 ctr_overflow_mask; + u64 ctr_values[64]; + u64 reserved[447]; +}; + +struct sbiret { + long error; + long value; +}; + +/** General pmu event codes specified in SBI PMU extension */ +enum sbi_pmu_hw_generic_events_t { + SBI_PMU_HW_NO_EVENT = 0, + SBI_PMU_HW_CPU_CYCLES = 1, + SBI_PMU_HW_INSTRUCTIONS = 2, + SBI_PMU_HW_CACHE_REFERENCES = 3, + SBI_PMU_HW_CACHE_MISSES = 4, + SBI_PMU_HW_BRANCH_INSTRUCTIONS = 5, + SBI_PMU_HW_BRANCH_MISSES = 6, + SBI_PMU_HW_BUS_CYCLES = 7, + SBI_PMU_HW_STALLED_CYCLES_FRONTEND = 8, + SBI_PMU_HW_STALLED_CYCLES_BACKEND = 9, + SBI_PMU_HW_REF_CPU_CYCLES = 10, + + SBI_PMU_HW_GENERAL_MAX, +}; + +/* SBI PMU counter types */ +enum sbi_pmu_ctr_type { + SBI_PMU_CTR_TYPE_HW = 0x0, + SBI_PMU_CTR_TYPE_FW, +}; + +/* Flags defined for config matching function */ +#define SBI_PMU_CFG_FLAG_SKIP_MATCH BIT(0) +#define SBI_PMU_CFG_FLAG_CLEAR_VALUE BIT(1) +#define SBI_PMU_CFG_FLAG_AUTO_START BIT(2) +#define SBI_PMU_CFG_FLAG_SET_VUINH BIT(3) +#define SBI_PMU_CFG_FLAG_SET_VSINH BIT(4) +#define SBI_PMU_CFG_FLAG_SET_UINH BIT(5) +#define SBI_PMU_CFG_FLAG_SET_SINH BIT(6) +#define SBI_PMU_CFG_FLAG_SET_MINH BIT(7) + +/* Flags defined for counter start function */ +#define SBI_PMU_START_FLAG_SET_INIT_VALUE BIT(0) +#define SBI_PMU_START_FLAG_INIT_SNAPSHOT BIT(1) + +/* Flags defined for counter stop function */ +#define SBI_PMU_STOP_FLAG_RESET BIT(0) +#define SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT BIT(1) + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5); + +bool guest_sbi_probe_extension(int extid, long *out_val); + +/* Make SBI version */ +static inline unsigned long sbi_mk_version(unsigned long major, + unsigned long minor) +{ + return ((major & SBI_SPEC_VERSION_MAJOR_MASK) << SBI_SPEC_VERSION_MAJOR_SHIFT) + | (minor & SBI_SPEC_VERSION_MINOR_MASK); +} + +unsigned 
long get_host_sbi_spec_version(void); + +#endif /* SELFTEST_KVM_SBI_H */ diff --git a/tools/testing/selftests/kvm/include/riscv/ucall.h b/tools/testing/selftests/kvm/include/riscv/ucall.h new file mode 100644 index 000000000000..a695ae36f3e0 --- /dev/null +++ b/tools/testing/selftests/kvm/include/riscv/ucall.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "processor.h" +#include "sbi.h" + +#define UCALL_EXIT_REASON KVM_EXIT_RISCV_SBI + +static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ +} + +static inline void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, + KVM_RISCV_SELFTESTS_SBI_UCALL, + uc, 0, 0, 0, 0, 0); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h b/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h new file mode 100644 index 000000000000..b0ed71302722 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/diag318_test_handler.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * + * Test handler for the s390x DIAGNOSE 0x0318 instruction. + * + * Copyright (C) 2020, IBM + */ + +#ifndef SELFTEST_KVM_DIAG318_TEST_HANDLER +#define SELFTEST_KVM_DIAG318_TEST_HANDLER + +uint64_t get_diag318_info(void); + +#endif diff --git a/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h b/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h new file mode 100644 index 000000000000..e43a57d99b56 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/kvm_util_arch.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +struct kvm_vm_arch {}; + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h index e0e96a5f608c..255c9b990f4c 100644 --- a/tools/testing/selftests/kvm/include/s390x/processor.h +++ b/tools/testing/selftests/kvm/include/s390x/processor.h @@ -5,6 +5,8 @@ #ifndef SELFTEST_KVM_PROCESSOR_H #define SELFTEST_KVM_PROCESSOR_H +#include <linux/compiler.h> + /* Bits in the region/segment table entry */ #define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */ #define REGION_ENTRY_PROTECT 0x200 /* region protection bit */ @@ -19,4 +21,10 @@ #define PAGE_PROTECT 0x200 /* HW read-only bit */ #define PAGE_NOEXEC 0x100 /* HW no-execute bit */ +/* Is there a portable way to do this? 
*/ +static inline void cpu_relax(void) +{ + barrier(); +} + #endif diff --git a/tools/testing/selftests/kvm/include/s390x/ucall.h b/tools/testing/selftests/kvm/include/s390x/ucall.h new file mode 100644 index 000000000000..8035a872a351 --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/ucall.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_S390_SIEIC + +static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ +} + +static inline void ucall_arch_do_ucall(vm_vaddr_t uc) +{ + /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ + asm volatile ("diag 0,%0,0x501" : : "a"(uc) : "memory"); +} + +#endif diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h index 12a9a4b9cead..bc760761e1a3 100644 --- a/tools/testing/selftests/kvm/include/sparsebit.h +++ b/tools/testing/selftests/kvm/include/sparsebit.h @@ -30,26 +30,26 @@ typedef uint64_t sparsebit_num_t; struct sparsebit *sparsebit_alloc(void); void sparsebit_free(struct sparsebit **sbitp); -void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src); +void sparsebit_copy(struct sparsebit *dstp, const struct sparsebit *src); -bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx); -bool sparsebit_is_set_num(struct sparsebit *sbit, +bool sparsebit_is_set(const struct sparsebit *sbit, sparsebit_idx_t idx); +bool sparsebit_is_set_num(const struct sparsebit *sbit, sparsebit_idx_t idx, sparsebit_num_t num); -bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx); -bool sparsebit_is_clear_num(struct sparsebit *sbit, +bool sparsebit_is_clear(const struct sparsebit *sbit, sparsebit_idx_t idx); +bool sparsebit_is_clear_num(const struct sparsebit *sbit, sparsebit_idx_t idx, sparsebit_num_t num); -sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit); -bool sparsebit_any_set(struct sparsebit *sbit); -bool sparsebit_any_clear(struct sparsebit *sbit); -bool sparsebit_all_set(struct sparsebit *sbit); -bool sparsebit_all_clear(struct sparsebit *sbit); -sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit); -sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit); -sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev); -sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev); -sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit, +sparsebit_num_t sparsebit_num_set(const struct sparsebit *sbit); +bool sparsebit_any_set(const struct sparsebit *sbit); +bool sparsebit_any_clear(const struct sparsebit *sbit); +bool sparsebit_all_set(const struct sparsebit *sbit); +bool sparsebit_all_clear(const struct sparsebit *sbit); +sparsebit_idx_t sparsebit_first_set(const struct sparsebit *sbit); +sparsebit_idx_t sparsebit_first_clear(const struct sparsebit *sbit); +sparsebit_idx_t sparsebit_next_set(const struct sparsebit *sbit, sparsebit_idx_t prev); +sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *sbit, sparsebit_idx_t prev); +sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *sbit, sparsebit_idx_t start, sparsebit_num_t num); -sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit, +sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *sbit, sparsebit_idx_t start, sparsebit_num_t num); void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx); @@ -62,9 +62,29 @@ void 
sparsebit_clear_num(struct sparsebit *sbitp, sparsebit_num_t num); void sparsebit_clear_all(struct sparsebit *sbitp); -void sparsebit_dump(FILE *stream, struct sparsebit *sbit, +void sparsebit_dump(FILE *stream, const struct sparsebit *sbit, unsigned int indent); -void sparsebit_validate_internal(struct sparsebit *sbit); +void sparsebit_validate_internal(const struct sparsebit *sbit); + +/* + * Iterate over the inclusive set ranges within sparsebit @s. In each iteration, + * @range_begin and @range_end will take the beginning and end of the set + * range, which are of type sparsebit_idx_t. + * + * For example, if the range [3, 7] (inclusive) is set, within the + * iteration, @range_begin will take the value 3 and @range_end will take + * the value 7. + * + * Ensure that there is at least one bit set before using this macro (e.g. by + * checking with sparsebit_any_set()), because sparsebit_first_set() will + * abort if none are set. + */ +#define sparsebit_for_each_set_range(s, range_begin, range_end) \ + for (range_begin = sparsebit_first_set(s), \ + range_end = sparsebit_next_clear(s, range_begin) - 1; \ + range_begin && range_end; \ + range_begin = sparsebit_next_set(s, range_end), \ + range_end = sparsebit_next_clear(s, range_begin) - 1) #ifdef __cplusplus } diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 5eb01bf51b86..3e473058849f 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -17,8 +17,11 @@ #include <errno.h> #include <unistd.h> #include <fcntl.h> +#include <sys/mman.h> #include "kselftest.h" +#define msecs_to_usecs(msec) ((msec) * 1000ULL) + static inline int _no_printf(const char *format, ...) { return 0; } #ifdef DEBUG @@ -32,31 +35,48 @@ static inline int _no_printf(const char *format, ...) { return 0; } #define pr_info(...) _no_printf(__VA_ARGS__) #endif -void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +void __printf(1, 2) print_skip(const char *fmt, ...); +#define __TEST_REQUIRE(f, fmt, ...) \ +do { \ + if (!(f)) \ + ksft_exit_skip("- " fmt "\n", ##__VA_ARGS__); \ +} while (0) + +#define TEST_REQUIRE(f) __TEST_REQUIRE(f, "Requirement not met: %s", #f) ssize_t test_write(int fd, const void *buf, size_t count); ssize_t test_read(int fd, void *buf, size_t count); int test_seq_read(const char *path, char **bufp, size_t *sizep); -void test_assert(bool exp, const char *exp_str, - const char *file, unsigned int line, const char *fmt, ...) - __attribute__((format(printf, 5, 6))); +void __printf(5, 6) test_assert(bool exp, const char *exp_str, + const char *file, unsigned int line, + const char *fmt, ...); #define TEST_ASSERT(e, fmt, ...) \ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__) -#define ASSERT_EQ(a, b) do { \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - TEST_ASSERT(__a == __b, \ - "ASSERT_EQ(%s, %s) failed.\n" \ - "\t%s is %#lx\n" \ - "\t%s is %#lx", \ - #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); \ +#define TEST_ASSERT_EQ(a, b) \ +do { \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + test_assert(__a == __b, #a " == " #b, __FILE__, __LINE__, \ + "%#lx != %#lx (%s != %s)", \ + (unsigned long)(__a), (unsigned long)(__b), #a, #b);\ } while (0) -#define TEST_FAIL(fmt, ...)
\ - TEST_ASSERT(false, fmt, ##__VA_ARGS__) +#define TEST_ASSERT_KVM_EXIT_REASON(vcpu, expected) do { \ + __u32 exit_reason = (vcpu)->run->exit_reason; \ + \ + TEST_ASSERT(exit_reason == (expected), \ + "Wanted KVM exit reason: %u (%s), got: %u (%s)", \ + (expected), exit_reason_str((expected)), \ + exit_reason, exit_reason_str(exit_reason)); \ +} while (0) + +#define TEST_FAIL(fmt, ...) do { \ + TEST_ASSERT(false, fmt, ##__VA_ARGS__); \ + __builtin_unreachable(); \ +} while (0) size_t parse_size(const char *size); @@ -64,5 +84,138 @@ int64_t timespec_to_ns(struct timespec ts); struct timespec timespec_add_ns(struct timespec ts, int64_t ns); struct timespec timespec_add(struct timespec ts1, struct timespec ts2); struct timespec timespec_sub(struct timespec ts1, struct timespec ts2); +struct timespec timespec_elapsed(struct timespec start); +struct timespec timespec_div(struct timespec ts, int divisor); + +struct guest_random_state { + uint32_t seed; +}; + +extern uint32_t guest_random_seed; +extern struct guest_random_state guest_rng; + +struct guest_random_state new_guest_random_state(uint32_t seed); +uint32_t guest_random_u32(struct guest_random_state *state); + +static inline bool __guest_random_bool(struct guest_random_state *state, + uint8_t percent) +{ + return (guest_random_u32(state) % 100) < percent; +} + +static inline bool guest_random_bool(struct guest_random_state *state) +{ + return __guest_random_bool(state, 50); +} + +static inline uint64_t guest_random_u64(struct guest_random_state *state) +{ + return ((uint64_t)guest_random_u32(state) << 32) | guest_random_u32(state); +} + +enum vm_mem_backing_src_type { + VM_MEM_SRC_ANONYMOUS, + VM_MEM_SRC_ANONYMOUS_THP, + VM_MEM_SRC_ANONYMOUS_HUGETLB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, + VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, + VM_MEM_SRC_SHMEM, + VM_MEM_SRC_SHARED_HUGETLB, + NUM_SRC_TYPES, +}; + +#define DEFAULT_VM_MEM_SRC VM_MEM_SRC_ANONYMOUS + +struct vm_mem_backing_src_alias { + const char *name; + uint32_t flag; +}; + +#define MIN_RUN_DELAY_NS 200000UL + +bool thp_configured(void); +size_t get_trans_hugepagesz(void); +size_t get_def_hugetlb_pagesz(void); +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); +size_t get_backing_src_pagesz(uint32_t i); +bool is_backing_src_hugetlb(uint32_t i); +void backing_src_help(const char *flag); +enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +long get_run_delay(void); + +/* + * Whether or not the given source type is shared memory (as opposed to + * anonymous). + */ +static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) +{ + return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; +} + +static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t) +{ + return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM; +} + +/* Aligns x up to the next multiple of size. Size must be a power of 2. 
*/ +static inline uint64_t align_up(uint64_t x, uint64_t size) +{ + uint64_t mask = size - 1; + + TEST_ASSERT(size != 0 && !(size & (size - 1)), + "size not a power of 2: %lu", size); + return ((x + mask) & ~mask); +} + +static inline uint64_t align_down(uint64_t x, uint64_t size) +{ + uint64_t x_aligned_up = align_up(x, size); + + if (x == x_aligned_up) + return x; + else + return x_aligned_up - size; +} + +static inline void *align_ptr_up(void *x, size_t size) +{ + return (void *)align_up((unsigned long)x, size); +} + +int atoi_paranoid(const char *num_str); + +static inline uint32_t atoi_positive(const char *name, const char *num_str) +{ + int num = atoi_paranoid(num_str); + + TEST_ASSERT(num > 0, "%s must be greater than 0, got '%s'", name, num_str); + return num; +} + +static inline uint32_t atoi_non_negative(const char *name, const char *num_str) +{ + int num = atoi_paranoid(num_str); + + TEST_ASSERT(num >= 0, "%s must be non-negative, got '%s'", name, num_str); + return num; +} + +int guest_vsnprintf(char *buf, int n, const char *fmt, va_list args); +__printf(3, 4) int guest_snprintf(char *buf, int n, const char *fmt, ...); + +char *strdup_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2), nonnull(1))); + +char *sys_get_cur_clocksource(void); #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/timer_test.h b/tools/testing/selftests/kvm/include/timer_test.h new file mode 100644 index 000000000000..9b6edaafe6d4 --- /dev/null +++ b/tools/testing/selftests/kvm/include/timer_test.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * timer test specific header + * + * Copyright (C) 2018, Google LLC + */ + +#ifndef SELFTEST_KVM_TIMER_TEST_H +#define SELFTEST_KVM_TIMER_TEST_H + +#include "kvm_util.h" + +#define NR_VCPUS_DEF 4 +#define NR_TEST_ITERS_DEF 5 +#define TIMER_TEST_PERIOD_MS_DEF 10 +#define TIMER_TEST_ERR_MARGIN_US 100 +#define TIMER_TEST_MIGRATION_FREQ_MS 2 + +/* Timer test cmdline parameters */ +struct test_args { + uint32_t nr_vcpus; + uint32_t nr_iter; + uint32_t timer_period_ms; + uint32_t migration_freq_ms; + uint32_t timer_err_margin_us; + /* Members of struct kvm_arm_counter_offset */ + uint64_t counter_offset; + uint64_t reserved; +}; + +/* Shared variables between host and guest */ +struct test_vcpu_shared_data { + uint32_t nr_iter; + int guest_stage; + uint64_t xcnt; +}; + +extern struct test_args test_args; +extern struct kvm_vcpu *vcpus[]; +extern struct test_vcpu_shared_data vcpu_shared_data[]; + +struct kvm_vm *test_vm_create(void); +void test_vm_cleanup(struct kvm_vm *vm); + +#endif /* SELFTEST_KVM_TIMER_TEST_H */ diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h new file mode 100644 index 000000000000..d9d6581b8d4f --- /dev/null +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018, Google LLC. + */ +#ifndef SELFTEST_KVM_UCALL_COMMON_H +#define SELFTEST_KVM_UCALL_COMMON_H +#include "test_util.h" +#include "ucall.h" + +/* Common ucalls */ +enum { + UCALL_NONE, + UCALL_SYNC, + UCALL_ABORT, + UCALL_PRINTF, + UCALL_DONE, + UCALL_UNHANDLED, +}; + +#define UCALL_MAX_ARGS 7 +#define UCALL_BUFFER_LEN 1024 + +struct ucall { + uint64_t cmd; + uint64_t args[UCALL_MAX_ARGS]; + char buffer[UCALL_BUFFER_LEN]; + + /* Host virtual address of this struct. 
*/ + struct ucall *hva; +}; + +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); +void ucall_arch_do_ucall(vm_vaddr_t uc); +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu); + +void ucall(uint64_t cmd, int nargs, ...); +__printf(2, 3) void ucall_fmt(uint64_t cmd, const char *fmt, ...); +__printf(5, 6) void ucall_assert(uint64_t cmd, const char *exp, + const char *file, unsigned int line, + const char *fmt, ...); +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc); +void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa); +int ucall_nr_pages_required(uint64_t page_size); + +/* + * Perform userspace call without any associated data. This bare call avoids + * allocating a ucall struct, which can be useful if the atomic operations in + * the full ucall() are problematic and/or unwanted. Note, this will come out + * as UCALL_NONE on the backend. + */ +#define GUEST_UCALL_NONE() ucall_arch_do_ucall((vm_vaddr_t)NULL) + +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) +#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) +#define GUEST_SYNC1(arg0) ucall(UCALL_SYNC, 1, arg0) +#define GUEST_SYNC2(arg0, arg1) ucall(UCALL_SYNC, 2, arg0, arg1) +#define GUEST_SYNC3(arg0, arg1, arg2) \ + ucall(UCALL_SYNC, 3, arg0, arg1, arg2) +#define GUEST_SYNC4(arg0, arg1, arg2, arg3) \ + ucall(UCALL_SYNC, 4, arg0, arg1, arg2, arg3) +#define GUEST_SYNC5(arg0, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 5, arg0, arg1, arg2, arg3, arg4) +#define GUEST_SYNC6(arg0, arg1, arg2, arg3, arg4, arg5) \ + ucall(UCALL_SYNC, 6, arg0, arg1, arg2, arg3, arg4, arg5) + +#define GUEST_PRINTF(_fmt, _args...) ucall_fmt(UCALL_PRINTF, _fmt, ##_args) +#define GUEST_DONE() ucall(UCALL_DONE, 0) + +#define REPORT_GUEST_PRINTF(ucall) pr_info("%s", (ucall).buffer) + +enum guest_assert_builtin_args { + GUEST_ERROR_STRING, + GUEST_FILE, + GUEST_LINE, + GUEST_ASSERT_BUILTIN_NARGS +}; + +#define ____GUEST_ASSERT(_condition, _exp, _fmt, _args...) \ +do { \ + if (!(_condition)) \ + ucall_assert(UCALL_ABORT, _exp, __FILE__, __LINE__, _fmt, ##_args); \ +} while (0) + +#define __GUEST_ASSERT(_condition, _fmt, _args...) \ + ____GUEST_ASSERT(_condition, #_condition, _fmt, ##_args) + +#define GUEST_ASSERT(_condition) \ + __GUEST_ASSERT(_condition, #_condition) + +#define GUEST_FAIL(_fmt, _args...) 
\ + ucall_assert(UCALL_ABORT, "Unconditional guest failure", \ + __FILE__, __LINE__, _fmt, ##_args) + +#define GUEST_ASSERT_EQ(a, b) \ +do { \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + ____GUEST_ASSERT(__a == __b, #a " == " #b, "%#lx != %#lx (%s != %s)", \ + (unsigned long)(__a), (unsigned long)(__b), #a, #b); \ +} while (0) + +#define GUEST_ASSERT_NE(a, b) \ +do { \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + ____GUEST_ASSERT(__a != __b, #a " != " #b, "%#lx == %#lx (%s == %s)", \ + (unsigned long)(__a), (unsigned long)(__b), #a, #b); \ +} while (0) + +#define REPORT_GUEST_ASSERT(ucall) \ + test_assert(false, (const char *)(ucall).args[GUEST_ERROR_STRING], \ + (const char *)(ucall).args[GUEST_FILE], \ + (ucall).args[GUEST_LINE], "%s", (ucall).buffer) + +#endif /* SELFTEST_KVM_UCALL_COMMON_H */ diff --git a/tools/testing/selftests/kvm/include/userfaultfd_util.h b/tools/testing/selftests/kvm/include/userfaultfd_util.h new file mode 100644 index 000000000000..60f7f9d435dc --- /dev/null +++ b/tools/testing/selftests/kvm/include/userfaultfd_util.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM userfaultfd util + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2019-2022 Google LLC + */ +#include <inttypes.h> +#include <time.h> +#include <pthread.h> +#include <linux/userfaultfd.h> + +#include "test_util.h" + +typedef int (*uffd_handler_t)(int uffd_mode, int uffd, struct uffd_msg *msg); + +struct uffd_reader_args { + int uffd_mode; + int uffd; + useconds_t delay; + uffd_handler_t handler; + /* Holds the read end of the pipe for killing the reader. */ + int pipe; +}; + +struct uffd_desc { + int uffd; + uint64_t num_readers; + /* Holds the write ends of the pipes for killing the readers. */ + int *pipefds; + pthread_t *readers; + struct uffd_reader_args *reader_args; +}; + +struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + void *hva, uint64_t len, + uint64_t num_readers, + uffd_handler_t handler); + +void uffd_stop_demand_paging(struct uffd_desc *uffd); + +#ifdef PRINT_PER_PAGE_UPDATES +#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__) +#endif + +#ifdef PRINT_PER_VCPU_UPDATES +#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__) +#else +#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) +#endif diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h new file mode 100644 index 000000000000..bed316fdecd5 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/apic.h + * + * Copyright (C) 2021, Google LLC. 
+ */ + +#ifndef SELFTEST_KVM_APIC_H +#define SELFTEST_KVM_APIC_H + +#include <stdint.h> + +#include "processor.h" + +#define APIC_DEFAULT_GPA 0xfee00000ULL + +/* APIC base address MSR and fields */ +#define MSR_IA32_APICBASE 0x0000001b +#define MSR_IA32_APICBASE_BSP (1<<8) +#define MSR_IA32_APICBASE_EXTD (1<<10) +#define MSR_IA32_APICBASE_ENABLE (1<<11) +#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define GET_APIC_BASE(x) (((x) >> 12) << 12) + +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) +#define APIC_ID 0x20 +#define APIC_LVR 0x30 +#define GET_APIC_ID_FIELD(x) (((x) >> 24) & 0xFF) +#define APIC_TASKPRI 0x80 +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_IRR 0x200 +#define APIC_ICR 0x300 +#define APIC_LVTCMCI 0x2f0 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define SET_APIC_DEST_FIELD(x) ((x) << 24) + +void apic_disable(void); +void xapic_enable(void); +void x2apic_enable(void); + +static inline uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static inline uint32_t xapic_read_reg(unsigned int reg) +{ + return ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2]; +} + +static inline void xapic_write_reg(unsigned int reg, uint32_t val) +{ + ((volatile uint32_t *)APIC_DEFAULT_GPA)[reg >> 2] = val; +} + +static inline uint64_t x2apic_read_reg(unsigned int reg) +{ + return rdmsr(APIC_BASE_MSR + (reg >> 4)); +} + +static inline void x2apic_write_reg(unsigned int reg, uint64_t value) +{ + wrmsr(APIC_BASE_MSR + (reg >> 4), value); +} + +#endif /* SELFTEST_KVM_APIC_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/evmcs.h b/tools/testing/selftests/kvm/include/x86_64/evmcs.h new file mode 100644 index 000000000000..901caf0e0939 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/evmcs.h @@ -0,0 +1,1279 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/evmcs.h + * + * Copyright (C) 2018, Red Hat, Inc. 
+ * + */ + +#ifndef SELFTEST_KVM_EVMCS_H +#define SELFTEST_KVM_EVMCS_H + +#include <stdint.h> +#include "hyperv.h" +#include "vmx.h" + +#define u16 uint16_t +#define u32 uint32_t +#define u64 uint64_t + +#define EVMCS_VERSION 1 + +extern bool enable_evmcs; + +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u16 padding16_1; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16_2[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 vm_entry_intr_info_field; + u32 
vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 padding32_1; + u32 hv_synthetic_controls; + struct { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 reserved:30; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u32 padding32_2; + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 guest_ia32_perf_global_ctrl; + u64 guest_ia32_s_cet; + u64 guest_ssp; + u64 guest_ia32_int_ssp_table_addr; + u64 guest_ia32_lbr_ctl; + u64 padding64_5[2]; + u64 xss_exit_bitmap; + u64 encls_exiting_bitmap; + u64 host_ia32_perf_global_ctrl; + u64 tsc_multiplier; + u64 host_ia32_s_cet; + u64 host_ssp; + u64 host_ia32_int_ssp_table_addr; + u64 padding64_6; +} __packed; + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + +extern struct hv_enlightened_vmcs *current_evmcs; + +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu); + +static inline void evmcs_enable(void) +{ + enable_evmcs = true; +} + +static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs) +{ + current_vp_assist->current_nested_vmcs = vmcs_pa; + current_vp_assist->enlighten_vmentry = 1; + + current_evmcs = vmcs; + + return 0; +} + +static inline bool load_evmcs(struct hyperv_test_pages *hv) +{ + if (evmcs_vmptrld(hv->enlightened_vmcs_gpa, hv->enlightened_vmcs)) + return false; + + current_evmcs->revision_id = EVMCS_VERSION; + + return true; +} + +static inline int evmcs_vmptrst(uint64_t *value) +{ + *value = current_vp_assist->current_nested_vmcs & + ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + return 0; +} + +static inline int evmcs_vmread(uint64_t encoding, uint64_t *value) +{ + switch (encoding) { + case GUEST_RIP: + *value = current_evmcs->guest_rip; + break; + case GUEST_RSP: + *value = current_evmcs->guest_rsp; + break; + case GUEST_RFLAGS: + *value = current_evmcs->guest_rflags; + break; + case HOST_IA32_PAT: + *value = current_evmcs->host_ia32_pat; + break; + case HOST_IA32_EFER: + *value = current_evmcs->host_ia32_efer; + break; + case HOST_CR0: + *value = current_evmcs->host_cr0; + break; + case HOST_CR3: + *value = current_evmcs->host_cr3; + break; + case HOST_CR4: + *value = current_evmcs->host_cr4; + break; + case HOST_IA32_SYSENTER_ESP: + *value = current_evmcs->host_ia32_sysenter_esp; + break; + case HOST_IA32_SYSENTER_EIP: + *value = current_evmcs->host_ia32_sysenter_eip; + break; + case HOST_RIP: + *value = current_evmcs->host_rip; + break; + case 
IO_BITMAP_A: + *value = current_evmcs->io_bitmap_a; + break; + case IO_BITMAP_B: + *value = current_evmcs->io_bitmap_b; + break; + case MSR_BITMAP: + *value = current_evmcs->msr_bitmap; + break; + case GUEST_ES_BASE: + *value = current_evmcs->guest_es_base; + break; + case GUEST_CS_BASE: + *value = current_evmcs->guest_cs_base; + break; + case GUEST_SS_BASE: + *value = current_evmcs->guest_ss_base; + break; + case GUEST_DS_BASE: + *value = current_evmcs->guest_ds_base; + break; + case GUEST_FS_BASE: + *value = current_evmcs->guest_fs_base; + break; + case GUEST_GS_BASE: + *value = current_evmcs->guest_gs_base; + break; + case GUEST_LDTR_BASE: + *value = current_evmcs->guest_ldtr_base; + break; + case GUEST_TR_BASE: + *value = current_evmcs->guest_tr_base; + break; + case GUEST_GDTR_BASE: + *value = current_evmcs->guest_gdtr_base; + break; + case GUEST_IDTR_BASE: + *value = current_evmcs->guest_idtr_base; + break; + case TSC_OFFSET: + *value = current_evmcs->tsc_offset; + break; + case VIRTUAL_APIC_PAGE_ADDR: + *value = current_evmcs->virtual_apic_page_addr; + break; + case VMCS_LINK_POINTER: + *value = current_evmcs->vmcs_link_pointer; + break; + case GUEST_IA32_DEBUGCTL: + *value = current_evmcs->guest_ia32_debugctl; + break; + case GUEST_IA32_PAT: + *value = current_evmcs->guest_ia32_pat; + break; + case GUEST_IA32_EFER: + *value = current_evmcs->guest_ia32_efer; + break; + case GUEST_PDPTR0: + *value = current_evmcs->guest_pdptr0; + break; + case GUEST_PDPTR1: + *value = current_evmcs->guest_pdptr1; + break; + case GUEST_PDPTR2: + *value = current_evmcs->guest_pdptr2; + break; + case GUEST_PDPTR3: + *value = current_evmcs->guest_pdptr3; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + *value = current_evmcs->guest_pending_dbg_exceptions; + break; + case GUEST_SYSENTER_ESP: + *value = current_evmcs->guest_sysenter_esp; + break; + case GUEST_SYSENTER_EIP: + *value = current_evmcs->guest_sysenter_eip; + break; + case CR0_GUEST_HOST_MASK: + *value = current_evmcs->cr0_guest_host_mask; + break; + case CR4_GUEST_HOST_MASK: + *value = current_evmcs->cr4_guest_host_mask; + break; + case CR0_READ_SHADOW: + *value = current_evmcs->cr0_read_shadow; + break; + case CR4_READ_SHADOW: + *value = current_evmcs->cr4_read_shadow; + break; + case GUEST_CR0: + *value = current_evmcs->guest_cr0; + break; + case GUEST_CR3: + *value = current_evmcs->guest_cr3; + break; + case GUEST_CR4: + *value = current_evmcs->guest_cr4; + break; + case GUEST_DR7: + *value = current_evmcs->guest_dr7; + break; + case HOST_FS_BASE: + *value = current_evmcs->host_fs_base; + break; + case HOST_GS_BASE: + *value = current_evmcs->host_gs_base; + break; + case HOST_TR_BASE: + *value = current_evmcs->host_tr_base; + break; + case HOST_GDTR_BASE: + *value = current_evmcs->host_gdtr_base; + break; + case HOST_IDTR_BASE: + *value = current_evmcs->host_idtr_base; + break; + case HOST_RSP: + *value = current_evmcs->host_rsp; + break; + case EPT_POINTER: + *value = current_evmcs->ept_pointer; + break; + case GUEST_BNDCFGS: + *value = current_evmcs->guest_bndcfgs; + break; + case XSS_EXIT_BITMAP: + *value = current_evmcs->xss_exit_bitmap; + break; + case GUEST_PHYSICAL_ADDRESS: + *value = current_evmcs->guest_physical_address; + break; + case EXIT_QUALIFICATION: + *value = current_evmcs->exit_qualification; + break; + case GUEST_LINEAR_ADDRESS: + *value = current_evmcs->guest_linear_address; + break; + case VM_EXIT_MSR_STORE_ADDR: + *value = current_evmcs->vm_exit_msr_store_addr; + break; + case VM_EXIT_MSR_LOAD_ADDR: + *value = 
current_evmcs->vm_exit_msr_load_addr; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + *value = current_evmcs->vm_entry_msr_load_addr; + break; + case CR3_TARGET_VALUE0: + *value = current_evmcs->cr3_target_value0; + break; + case CR3_TARGET_VALUE1: + *value = current_evmcs->cr3_target_value1; + break; + case CR3_TARGET_VALUE2: + *value = current_evmcs->cr3_target_value2; + break; + case CR3_TARGET_VALUE3: + *value = current_evmcs->cr3_target_value3; + break; + case TPR_THRESHOLD: + *value = current_evmcs->tpr_threshold; + break; + case GUEST_INTERRUPTIBILITY_INFO: + *value = current_evmcs->guest_interruptibility_info; + break; + case CPU_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->cpu_based_vm_exec_control; + break; + case EXCEPTION_BITMAP: + *value = current_evmcs->exception_bitmap; + break; + case VM_ENTRY_CONTROLS: + *value = current_evmcs->vm_entry_controls; + break; + case VM_ENTRY_INTR_INFO_FIELD: + *value = current_evmcs->vm_entry_intr_info_field; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + *value = current_evmcs->vm_entry_exception_error_code; + break; + case VM_ENTRY_INSTRUCTION_LEN: + *value = current_evmcs->vm_entry_instruction_len; + break; + case HOST_IA32_SYSENTER_CS: + *value = current_evmcs->host_ia32_sysenter_cs; + break; + case PIN_BASED_VM_EXEC_CONTROL: + *value = current_evmcs->pin_based_vm_exec_control; + break; + case VM_EXIT_CONTROLS: + *value = current_evmcs->vm_exit_controls; + break; + case SECONDARY_VM_EXEC_CONTROL: + *value = current_evmcs->secondary_vm_exec_control; + break; + case GUEST_ES_LIMIT: + *value = current_evmcs->guest_es_limit; + break; + case GUEST_CS_LIMIT: + *value = current_evmcs->guest_cs_limit; + break; + case GUEST_SS_LIMIT: + *value = current_evmcs->guest_ss_limit; + break; + case GUEST_DS_LIMIT: + *value = current_evmcs->guest_ds_limit; + break; + case GUEST_FS_LIMIT: + *value = current_evmcs->guest_fs_limit; + break; + case GUEST_GS_LIMIT: + *value = current_evmcs->guest_gs_limit; + break; + case GUEST_LDTR_LIMIT: + *value = current_evmcs->guest_ldtr_limit; + break; + case GUEST_TR_LIMIT: + *value = current_evmcs->guest_tr_limit; + break; + case GUEST_GDTR_LIMIT: + *value = current_evmcs->guest_gdtr_limit; + break; + case GUEST_IDTR_LIMIT: + *value = current_evmcs->guest_idtr_limit; + break; + case GUEST_ES_AR_BYTES: + *value = current_evmcs->guest_es_ar_bytes; + break; + case GUEST_CS_AR_BYTES: + *value = current_evmcs->guest_cs_ar_bytes; + break; + case GUEST_SS_AR_BYTES: + *value = current_evmcs->guest_ss_ar_bytes; + break; + case GUEST_DS_AR_BYTES: + *value = current_evmcs->guest_ds_ar_bytes; + break; + case GUEST_FS_AR_BYTES: + *value = current_evmcs->guest_fs_ar_bytes; + break; + case GUEST_GS_AR_BYTES: + *value = current_evmcs->guest_gs_ar_bytes; + break; + case GUEST_LDTR_AR_BYTES: + *value = current_evmcs->guest_ldtr_ar_bytes; + break; + case GUEST_TR_AR_BYTES: + *value = current_evmcs->guest_tr_ar_bytes; + break; + case GUEST_ACTIVITY_STATE: + *value = current_evmcs->guest_activity_state; + break; + case GUEST_SYSENTER_CS: + *value = current_evmcs->guest_sysenter_cs; + break; + case VM_INSTRUCTION_ERROR: + *value = current_evmcs->vm_instruction_error; + break; + case VM_EXIT_REASON: + *value = current_evmcs->vm_exit_reason; + break; + case VM_EXIT_INTR_INFO: + *value = current_evmcs->vm_exit_intr_info; + break; + case VM_EXIT_INTR_ERROR_CODE: + *value = current_evmcs->vm_exit_intr_error_code; + break; + case IDT_VECTORING_INFO_FIELD: + *value = current_evmcs->idt_vectoring_info_field; + break; + case IDT_VECTORING_ERROR_CODE: 
+ *value = current_evmcs->idt_vectoring_error_code; + break; + case VM_EXIT_INSTRUCTION_LEN: + *value = current_evmcs->vm_exit_instruction_len; + break; + case VMX_INSTRUCTION_INFO: + *value = current_evmcs->vmx_instruction_info; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + *value = current_evmcs->page_fault_error_code_mask; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + *value = current_evmcs->page_fault_error_code_match; + break; + case CR3_TARGET_COUNT: + *value = current_evmcs->cr3_target_count; + break; + case VM_EXIT_MSR_STORE_COUNT: + *value = current_evmcs->vm_exit_msr_store_count; + break; + case VM_EXIT_MSR_LOAD_COUNT: + *value = current_evmcs->vm_exit_msr_load_count; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + *value = current_evmcs->vm_entry_msr_load_count; + break; + case HOST_ES_SELECTOR: + *value = current_evmcs->host_es_selector; + break; + case HOST_CS_SELECTOR: + *value = current_evmcs->host_cs_selector; + break; + case HOST_SS_SELECTOR: + *value = current_evmcs->host_ss_selector; + break; + case HOST_DS_SELECTOR: + *value = current_evmcs->host_ds_selector; + break; + case HOST_FS_SELECTOR: + *value = current_evmcs->host_fs_selector; + break; + case HOST_GS_SELECTOR: + *value = current_evmcs->host_gs_selector; + break; + case HOST_TR_SELECTOR: + *value = current_evmcs->host_tr_selector; + break; + case GUEST_ES_SELECTOR: + *value = current_evmcs->guest_es_selector; + break; + case GUEST_CS_SELECTOR: + *value = current_evmcs->guest_cs_selector; + break; + case GUEST_SS_SELECTOR: + *value = current_evmcs->guest_ss_selector; + break; + case GUEST_DS_SELECTOR: + *value = current_evmcs->guest_ds_selector; + break; + case GUEST_FS_SELECTOR: + *value = current_evmcs->guest_fs_selector; + break; + case GUEST_GS_SELECTOR: + *value = current_evmcs->guest_gs_selector; + break; + case GUEST_LDTR_SELECTOR: + *value = current_evmcs->guest_ldtr_selector; + break; + case GUEST_TR_SELECTOR: + *value = current_evmcs->guest_tr_selector; + break; + case VIRTUAL_PROCESSOR_ID: + *value = current_evmcs->virtual_processor_id; + break; + case HOST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->host_ia32_perf_global_ctrl; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + *value = current_evmcs->guest_ia32_perf_global_ctrl; + break; + case ENCLS_EXITING_BITMAP: + *value = current_evmcs->encls_exiting_bitmap; + break; + case TSC_MULTIPLIER: + *value = current_evmcs->tsc_multiplier; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value) +{ + switch (encoding) { + case GUEST_RIP: + current_evmcs->guest_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_RSP: + current_evmcs->guest_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case GUEST_RFLAGS: + current_evmcs->guest_rflags = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case HOST_IA32_PAT: + current_evmcs->host_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_EFER: + current_evmcs->host_ia32_efer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR0: + current_evmcs->host_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR3: + current_evmcs->host_cr3 = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CR4: + current_evmcs->host_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_SYSENTER_ESP: + current_evmcs->host_ia32_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_IA32_SYSENTER_EIP: + current_evmcs->host_ia32_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_RIP: + current_evmcs->host_rip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case IO_BITMAP_A: + current_evmcs->io_bitmap_a = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; + break; + case IO_BITMAP_B: + current_evmcs->io_bitmap_b = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP; + break; + case MSR_BITMAP: + current_evmcs->msr_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + break; + case GUEST_ES_BASE: + current_evmcs->guest_es_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_BASE: + current_evmcs->guest_cs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_BASE: + current_evmcs->guest_ss_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_BASE: + current_evmcs->guest_ds_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_BASE: + current_evmcs->guest_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_BASE: + current_evmcs->guest_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_BASE: + current_evmcs->guest_ldtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_BASE: + current_evmcs->guest_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GDTR_BASE: + current_evmcs->guest_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_IDTR_BASE: + current_evmcs->guest_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case TSC_OFFSET: + current_evmcs->tsc_offset = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case VIRTUAL_APIC_PAGE_ADDR: + current_evmcs->virtual_apic_page_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case VMCS_LINK_POINTER: + current_evmcs->vmcs_link_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_DEBUGCTL: + current_evmcs->guest_ia32_debugctl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_PAT: + current_evmcs->guest_ia32_pat = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_IA32_EFER: + current_evmcs->guest_ia32_efer = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR0: + current_evmcs->guest_pdptr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR1: + current_evmcs->guest_pdptr1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR2: + current_evmcs->guest_pdptr2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PDPTR3: + current_evmcs->guest_pdptr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_PENDING_DBG_EXCEPTIONS: + current_evmcs->guest_pending_dbg_exceptions = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_ESP: + current_evmcs->guest_sysenter_esp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_EIP: + current_evmcs->guest_sysenter_eip = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case CR0_GUEST_HOST_MASK: + current_evmcs->cr0_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR4_GUEST_HOST_MASK: + current_evmcs->cr4_guest_host_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR0_READ_SHADOW: + current_evmcs->cr0_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case CR4_READ_SHADOW: + current_evmcs->cr4_read_shadow = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR0: + current_evmcs->guest_cr0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR3: + current_evmcs->guest_cr3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_CR4: + current_evmcs->guest_cr4 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case GUEST_DR7: + current_evmcs->guest_dr7 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR; + break; + case HOST_FS_BASE: + current_evmcs->host_fs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_GS_BASE: + current_evmcs->host_gs_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_TR_BASE: + current_evmcs->host_tr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_GDTR_BASE: + current_evmcs->host_gdtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_IDTR_BASE: + current_evmcs->host_idtr_base = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case HOST_RSP: + current_evmcs->host_rsp = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + break; + case EPT_POINTER: + current_evmcs->ept_pointer = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; + break; + case GUEST_BNDCFGS: + current_evmcs->guest_bndcfgs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case XSS_EXIT_BITMAP: + 
current_evmcs->xss_exit_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case GUEST_PHYSICAL_ADDRESS: + current_evmcs->guest_physical_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case EXIT_QUALIFICATION: + current_evmcs->exit_qualification = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_LINEAR_ADDRESS: + current_evmcs->guest_linear_address = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_MSR_STORE_ADDR: + current_evmcs->vm_exit_msr_store_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_LOAD_ADDR: + current_evmcs->vm_exit_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_ENTRY_MSR_LOAD_ADDR: + current_evmcs->vm_entry_msr_load_addr = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE0: + current_evmcs->cr3_target_value0 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE1: + current_evmcs->cr3_target_value1 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE2: + current_evmcs->cr3_target_value2 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_VALUE3: + current_evmcs->cr3_target_value3 = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case TPR_THRESHOLD: + current_evmcs->tpr_threshold = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case GUEST_INTERRUPTIBILITY_INFO: + current_evmcs->guest_interruptibility_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC; + break; + case CPU_BASED_VM_EXEC_CONTROL: + current_evmcs->cpu_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC; + break; + case EXCEPTION_BITMAP: + current_evmcs->exception_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN; + break; + case VM_ENTRY_CONTROLS: + current_evmcs->vm_entry_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY; + break; + case VM_ENTRY_INTR_INFO_FIELD: + current_evmcs->vm_entry_intr_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case VM_ENTRY_EXCEPTION_ERROR_CODE: + current_evmcs->vm_entry_exception_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case VM_ENTRY_INSTRUCTION_LEN: + current_evmcs->vm_entry_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT; + break; + case HOST_IA32_SYSENTER_CS: + current_evmcs->host_ia32_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case PIN_BASED_VM_EXEC_CONTROL: + current_evmcs->pin_based_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; + break; + case VM_EXIT_CONTROLS: + current_evmcs->vm_exit_controls = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; 
+ break; + case SECONDARY_VM_EXEC_CONTROL: + current_evmcs->secondary_vm_exec_control = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1; + break; + case GUEST_ES_LIMIT: + current_evmcs->guest_es_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_LIMIT: + current_evmcs->guest_cs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_LIMIT: + current_evmcs->guest_ss_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_LIMIT: + current_evmcs->guest_ds_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_LIMIT: + current_evmcs->guest_fs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_LIMIT: + current_evmcs->guest_gs_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_LIMIT: + current_evmcs->guest_ldtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_LIMIT: + current_evmcs->guest_tr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GDTR_LIMIT: + current_evmcs->guest_gdtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_IDTR_LIMIT: + current_evmcs->guest_idtr_limit = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_ES_AR_BYTES: + current_evmcs->guest_es_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_AR_BYTES: + current_evmcs->guest_cs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_AR_BYTES: + current_evmcs->guest_ss_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_AR_BYTES: + current_evmcs->guest_ds_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_AR_BYTES: + current_evmcs->guest_fs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_AR_BYTES: + current_evmcs->guest_gs_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_AR_BYTES: + current_evmcs->guest_ldtr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_AR_BYTES: + current_evmcs->guest_tr_ar_bytes = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_ACTIVITY_STATE: + current_evmcs->guest_activity_state = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case GUEST_SYSENTER_CS: + current_evmcs->guest_sysenter_cs = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case VM_INSTRUCTION_ERROR: + current_evmcs->vm_instruction_error = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_REASON: + current_evmcs->vm_exit_reason = value; + 
current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INTR_INFO: + current_evmcs->vm_exit_intr_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INTR_ERROR_CODE: + current_evmcs->vm_exit_intr_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case IDT_VECTORING_INFO_FIELD: + current_evmcs->idt_vectoring_info_field = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case IDT_VECTORING_ERROR_CODE: + current_evmcs->idt_vectoring_error_code = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VM_EXIT_INSTRUCTION_LEN: + current_evmcs->vm_exit_instruction_len = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case VMX_INSTRUCTION_INFO: + current_evmcs->vmx_instruction_info = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE; + break; + case PAGE_FAULT_ERROR_CODE_MASK: + current_evmcs->page_fault_error_code_mask = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case PAGE_FAULT_ERROR_CODE_MATCH: + current_evmcs->page_fault_error_code_match = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case CR3_TARGET_COUNT: + current_evmcs->cr3_target_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_STORE_COUNT: + current_evmcs->vm_exit_msr_store_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_EXIT_MSR_LOAD_COUNT: + current_evmcs->vm_exit_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case VM_ENTRY_MSR_LOAD_COUNT: + current_evmcs->vm_entry_msr_load_count = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + break; + case HOST_ES_SELECTOR: + current_evmcs->host_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_CS_SELECTOR: + current_evmcs->host_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_SS_SELECTOR: + current_evmcs->host_ss_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_DS_SELECTOR: + current_evmcs->host_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_FS_SELECTOR: + current_evmcs->host_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_GS_SELECTOR: + current_evmcs->host_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case HOST_TR_SELECTOR: + current_evmcs->host_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_ES_SELECTOR: + current_evmcs->guest_es_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_CS_SELECTOR: + current_evmcs->guest_cs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_SS_SELECTOR: + current_evmcs->guest_ss_selector = value; + current_evmcs->hv_clean_fields &= 
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_DS_SELECTOR: + current_evmcs->guest_ds_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_FS_SELECTOR: + current_evmcs->guest_fs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_GS_SELECTOR: + current_evmcs->guest_gs_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_LDTR_SELECTOR: + current_evmcs->guest_ldtr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case GUEST_TR_SELECTOR: + current_evmcs->guest_tr_selector = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2; + break; + case VIRTUAL_PROCESSOR_ID: + current_evmcs->virtual_processor_id = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT; + break; + case HOST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->host_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + break; + case GUEST_IA32_PERF_GLOBAL_CTRL: + current_evmcs->guest_ia32_perf_global_ctrl = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1; + break; + case ENCLS_EXITING_BITMAP: + current_evmcs->encls_exiting_bitmap = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + case TSC_MULTIPLIER: + current_evmcs->tsc_multiplier = value; + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2; + break; + default: return 1; + } + + return 0; +} + +static inline int evmcs_vmlaunch(void) +{ + int ret; + + current_evmcs->hv_clean_fields = 0; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmlaunch;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)&current_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)&current_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +/* + * No guest state (e.g. GPRs) is established by this vmresume.
+ */ +static inline int evmcs_vmresume(void) +{ + int ret; + + /* HOST_RIP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1; + /* HOST_RSP */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER; + + __asm__ __volatile__("push %%rbp;" + "push %%rcx;" + "push %%rdx;" + "push %%rsi;" + "push %%rdi;" + "push $0;" + "mov %%rsp, (%[host_rsp]);" + "lea 1f(%%rip), %%rax;" + "mov %%rax, (%[host_rip]);" + "vmresume;" + "incq (%%rsp);" + "1: pop %%rax;" + "pop %%rdi;" + "pop %%rsi;" + "pop %%rdx;" + "pop %%rcx;" + "pop %%rbp;" + : [ret]"=&a"(ret) + : [host_rsp]"r" + ((uint64_t)&current_evmcs->host_rsp), + [host_rip]"r" + ((uint64_t)&current_evmcs->host_rip) + : "memory", "cc", "rbx", "r8", "r9", "r10", + "r11", "r12", "r13", "r14", "r15"); + return ret; +} + +#endif /* !SELFTEST_KVM_EVMCS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h new file mode 100644 index 000000000000..fa65b908b13e --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h @@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/hyperv.h + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ + +#ifndef SELFTEST_KVM_HYPERV_H +#define SELFTEST_KVM_HYPERV_H + +#include "processor.h" + +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_HYPERCALL 0x40000001 +#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_X64_MSR_RESET 0x40000003 +#define HV_X64_MSR_VP_RUNTIME 0x40000010 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG
0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +#define HV_X64_MSR_GUEST_IDLE 0x400000F0 +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* HYPERV_CPUID_FEATURES.EAX */ +#define HV_MSR_VP_RUNTIME_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 0) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 1) +#define HV_MSR_SYNIC_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 2) +#define HV_MSR_SYNTIMER_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 3) +#define HV_MSR_APIC_ACCESS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 4) +#define HV_MSR_HYPERCALL_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 5) +#define HV_MSR_VP_INDEX_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 6) +#define HV_MSR_RESET_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 7) +#define HV_MSR_STAT_PAGES_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 8) +#define HV_MSR_REFERENCE_TSC_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 9) +#define HV_MSR_GUEST_IDLE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 10) +#define HV_ACCESS_FREQUENCY_MSRS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 11) +#define HV_ACCESS_REENLIGHTENMENT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 13) +#define HV_ACCESS_TSC_INVARIANT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EAX, 15) + +/* HYPERV_CPUID_FEATURES.EBX */ +#define HV_CREATE_PARTITIONS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 0) +#define HV_ACCESS_PARTITION_ID \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 1) +#define HV_ACCESS_MEMORY_POOL \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 2) +#define HV_ADJUST_MESSAGE_BUFFERS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 3) +#define HV_POST_MESSAGES \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 4) +#define HV_SIGNAL_EVENTS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 5) +#define HV_CREATE_PORT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 6) +#define HV_CONNECT_PORT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 7) +#define HV_ACCESS_STATS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 8) +#define HV_DEBUGGING \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 11) +#define HV_CPU_MANAGEMENT \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 12) +#define HV_ENABLE_EXTENDED_HYPERCALLS \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 20) +#define HV_ISOLATION \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EBX, 22) + +/* HYPERV_CPUID_FEATURES.EDX */ +#define HV_X64_MWAIT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 0) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE \ + 
KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 1) +#define HV_X64_PERF_MONITOR_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 2) +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 3) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 4) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 5) +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 8) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 10) +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 11) +#define HV_STIMER_DIRECT_MODE_AVAILABLE \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_FEATURES, 0, EDX, 19) + +/* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ +#define HV_X64_AS_SWITCH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 0) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 1) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 2) +#define HV_X64_APIC_ACCESS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 3) +#define HV_X64_SYSTEM_RESET_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 4) +#define HV_X64_RELAXED_TIMING_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 5) +#define HV_DEPRECATING_AEOI_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 9) +#define HV_X64_CLUSTER_IPI_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 10) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 11) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14) + +/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1) + +/* Hypercalls */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_CREATE_VP 0x004e +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +/* Extended hypercalls */ +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define 
HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_ACCESS_DENIED 6 +#define HV_STATUS_OPERATION_DENIED 8 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* hypercall options */ +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 + +/* + * Issue a Hyper-V hypercall. Returns exception vector raised or 0, 'hv_status' + * is set to the hypercall status (if no exception occurred). + */ +static inline uint8_t __hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address, + uint64_t *hv_status) +{ + uint64_t error_code; + uint8_t vector; + + /* Note both the hypercall and the "asm safe" clobber r9-r11. */ + asm volatile("mov %[output_address], %%r8\n\t" + KVM_ASM_SAFE("vmcall") + : "=a" (*hv_status), + "+c" (control), "+d" (input_address), + KVM_ASM_SAFE_OUTPUTS(vector, error_code) + : [output_address] "r"(output_address), + "a" (-EFAULT) + : "cc", "memory", "r8", KVM_ASM_SAFE_CLOBBERS); + return vector; +} + +/* Issue a Hyper-V hypercall and assert that it succeeded. */ +static inline void hyperv_hypercall(u64 control, vm_vaddr_t input_address, + vm_vaddr_t output_address) +{ + uint64_t hv_status; + uint8_t vector; + + vector = __hyperv_hypercall(control, input_address, output_address, &hv_status); + + GUEST_ASSERT(!vector); + GUEST_ASSERT((hv_status & 0xffff) == 0); +} + +/* Write 'Fast' hypercall input 'data' to the first 'n_sse_regs' SSE regs */ +static inline void hyperv_write_xmm_input(void *data, int n_sse_regs) +{ + int i; + + for (i = 0; i < n_sse_regs; i++) + write_sse_reg(i, (sse128_t *)(data + sizeof(sse128_t) * i)); +} + +/* Proper HV_X64_MSR_GUEST_OS_ID value */ +#define HYPERV_LINUX_OS_ID ((u64)0x8100 << 48) + +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + +/* Define virtual processor assist page structure. 
*/ +struct hv_vp_assist_page { + __u32 apic_assist; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; + __u64 current_nested_vmcs; +} __packed; + +extern struct hv_vp_assist_page *current_vp_assist; + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist); + +struct hyperv_test_pages { + /* VP assist page */ + void *vp_assist_hva; + uint64_t vp_assist_gpa; + void *vp_assist; + + /* Partition assist page */ + void *partition_assist_hva; + uint64_t partition_assist_gpa; + void *partition_assist; + + /* Enlightened VMCS */ + void *enlightened_vmcs_hva; + uint64_t enlightened_vmcs_gpa; + void *enlightened_vmcs; +}; + +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva); + +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0) + +#endif /* !SELFTEST_KVM_HYPERV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h new file mode 100644 index 000000000000..972bb1c4ab4c --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/kvm_util_arch.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UTIL_ARCH_H +#define SELFTEST_KVM_UTIL_ARCH_H + +#include <stdbool.h> +#include <stdint.h> + +#include "kvm_util_types.h" +#include "test_util.h" + +extern bool is_forced_emulation_enabled; + +struct kvm_vm_arch { + vm_vaddr_t gdt; + vm_vaddr_t tss; + vm_vaddr_t idt; + + uint64_t c_bit; + uint64_t s_bit; + int sev_fd; + bool is_pt_protected; +}; + +static inline bool __vm_arch_has_protected_memory(struct kvm_vm_arch *arch) +{ + return arch->c_bit || arch->s_bit; +} + +#define vm_arch_has_protected_memory(vm) \ + __vm_arch_has_protected_memory(&(vm)->arch) + +#define vcpu_arch_put_guest(mem, __val) \ +do { \ + const typeof(mem) val = (__val); \ + \ + if (!is_forced_emulation_enabled || guest_random_bool(&guest_rng)) { \ + (mem) = val; \ + } else if (guest_random_bool(&guest_rng)) { \ + __asm__ __volatile__(KVM_FEP "mov %1, %0" \ + : "+m" (mem) \ + : "r" (val) : "memory"); \ + } else { \ + uint64_t __old = READ_ONCE(mem); \ + \ + __asm__ __volatile__(KVM_FEP LOCK_PREFIX "cmpxchg %[new], %[ptr]" \ + : [ptr] "+m" (mem), [old] "+a" (__old) \ + : [new]"r" (val) : "memory", "cc"); \ + } \ +} while (0) + +#endif // SELFTEST_KVM_UTIL_ARCH_H diff --git a/tools/testing/selftests/kvm/include/x86_64/mce.h b/tools/testing/selftests/kvm/include/x86_64/mce.h new file mode 100644 index 000000000000..6119321f3f5d --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/mce.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/mce.h + * + * Copyright (C) 2022, Google LLC. + */ + +#ifndef SELFTEST_KVM_MCE_H +#define SELFTEST_KVM_MCE_H + +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define KVM_MAX_MCE_BANKS 32 +#define MCG_CAP_BANKS_MASK 0xff /* Bit 0-7 of the MCG_CAP register are #banks */ +#define MCI_STATUS_VAL (1ULL << 63) /* valid error */ +#define MCI_STATUS_UC (1ULL << 61) /* uncorrected error */ +#define MCI_STATUS_EN (1ULL << 60) /* error enabled */ +#define MCI_STATUS_MISCV (1ULL << 59) /* misc error reg. 
valid */ +#define MCI_STATUS_ADDRV (1ULL << 58) /* addr reg. valid */ +#define MCM_ADDR_PHYS 2 /* physical address */ +#define MCI_CTL2_CMCI_EN BIT_ULL(30) + +#endif /* SELFTEST_KVM_MCE_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/pmu.h b/tools/testing/selftests/kvm/include/x86_64/pmu.h new file mode 100644 index 000000000000..3c10c4dc0ae8 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/pmu.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023, Tencent, Inc. + */ +#ifndef SELFTEST_KVM_PMU_H +#define SELFTEST_KVM_PMU_H + +#include <stdint.h> + +#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 + +/* + * Encode an eventsel+umask pair into event-select MSR format. Note, this is + * technically AMD's format, as Intel's format only supports 8 bits for the + * event selector, i.e. doesn't use bits 24:16 for the selector. But, OR-ing + * in '0' is a nop and won't clobber the CMASK. + */ +#define RAW_EVENT(eventsel, umask) (((eventsel & 0xf00UL) << 24) | \ + ((eventsel) & 0xff) | \ + ((umask) & 0xff) << 8) + +/* + * These are technically Intel's definitions, but except for CMASK (see above), + * AMD's layout is compatible with Intel's. + */ +#define ARCH_PERFMON_EVENTSEL_EVENT GENMASK_ULL(7, 0) +#define ARCH_PERFMON_EVENTSEL_UMASK GENMASK_ULL(15, 8) +#define ARCH_PERFMON_EVENTSEL_USR BIT_ULL(16) +#define ARCH_PERFMON_EVENTSEL_OS BIT_ULL(17) +#define ARCH_PERFMON_EVENTSEL_EDGE BIT_ULL(18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL BIT_ULL(19) +#define ARCH_PERFMON_EVENTSEL_INT BIT_ULL(20) +#define ARCH_PERFMON_EVENTSEL_ANY BIT_ULL(21) +#define ARCH_PERFMON_EVENTSEL_ENABLE BIT_ULL(22) +#define ARCH_PERFMON_EVENTSEL_INV BIT_ULL(23) +#define ARCH_PERFMON_EVENTSEL_CMASK GENMASK_ULL(31, 24) + +/* RDPMC control flags, Intel only. */ +#define INTEL_RDPMC_METRICS BIT_ULL(29) +#define INTEL_RDPMC_FIXED BIT_ULL(30) +#define INTEL_RDPMC_FAST BIT_ULL(31) + +/* Fixed PMC controls, Intel only. */ +#define FIXED_PMC_GLOBAL_CTRL_ENABLE(_idx) BIT_ULL((32 + (_idx))) + +#define FIXED_PMC_KERNEL BIT_ULL(0) +#define FIXED_PMC_USER BIT_ULL(1) +#define FIXED_PMC_ANYTHREAD BIT_ULL(2) +#define FIXED_PMC_ENABLE_PMI BIT_ULL(3) +#define FIXED_PMC_NR_BITS 4 +#define FIXED_PMC_CTRL(_idx, _val) ((_val) << ((_idx) * FIXED_PMC_NR_BITS)) + +#define PMU_CAP_FW_WRITES BIT_ULL(13) +#define PMU_CAP_LBR_FMT 0x3f + +#define INTEL_ARCH_CPU_CYCLES RAW_EVENT(0x3c, 0x00) +#define INTEL_ARCH_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) +#define INTEL_ARCH_REFERENCE_CYCLES RAW_EVENT(0x3c, 0x01) +#define INTEL_ARCH_LLC_REFERENCES RAW_EVENT(0x2e, 0x4f) +#define INTEL_ARCH_LLC_MISSES RAW_EVENT(0x2e, 0x41) +#define INTEL_ARCH_BRANCHES_RETIRED RAW_EVENT(0xc4, 0x00) +#define INTEL_ARCH_BRANCHES_MISPREDICTED RAW_EVENT(0xc5, 0x00) +#define INTEL_ARCH_TOPDOWN_SLOTS RAW_EVENT(0xa4, 0x01) + +#define AMD_ZEN_CORE_CYCLES RAW_EVENT(0x76, 0x00) +#define AMD_ZEN_INSTRUCTIONS_RETIRED RAW_EVENT(0xc0, 0x00) +#define AMD_ZEN_BRANCHES_RETIRED RAW_EVENT(0xc2, 0x00) +#define AMD_ZEN_BRANCHES_MISPREDICTED RAW_EVENT(0xc3, 0x00) + +/* + * Note! The order and thus the index of the architectural events matters as + * support for each event is enumerated via CPUID using the index of the event. 
+ */ +enum intel_pmu_architectural_events { + INTEL_ARCH_CPU_CYCLES_INDEX, + INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX, + INTEL_ARCH_REFERENCE_CYCLES_INDEX, + INTEL_ARCH_LLC_REFERENCES_INDEX, + INTEL_ARCH_LLC_MISSES_INDEX, + INTEL_ARCH_BRANCHES_RETIRED_INDEX, + INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX, + INTEL_ARCH_TOPDOWN_SLOTS_INDEX, + NR_INTEL_ARCH_EVENTS, +}; + +enum amd_pmu_zen_events { + AMD_ZEN_CORE_CYCLES_INDEX, + AMD_ZEN_INSTRUCTIONS_INDEX, + AMD_ZEN_BRANCHES_INDEX, + AMD_ZEN_BRANCH_MISSES_INDEX, + NR_AMD_ZEN_EVENTS, +}; + +extern const uint64_t intel_pmu_arch_events[]; +extern const uint64_t amd_pmu_zen_events[]; + +#endif /* SELFTEST_KVM_PMU_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 82b7fe16a824..8eb57de0b587 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -10,8 +10,24 @@ #include <assert.h> #include <stdint.h> +#include <syscall.h> #include <asm/msr-index.h> +#include <asm/prctl.h> + +#include <linux/kvm_para.h> +#include <linux/stringify.h> + +#include "kvm_util.h" +#include "ucall_common.h" + +extern bool host_cpu_is_intel; +extern bool host_cpu_is_amd; + +/* Forced emulation prefix, used to invoke the emulator unconditionally. */ +#define KVM_FEP "ud2; .byte 'k', 'v', 'm';" + +#define NMI_VECTOR 0x02 #define X86_EFLAGS_FIXED (1u << 1) @@ -27,6 +43,7 @@ #define X86_CR4_OSFXSR (1ul << 9) #define X86_CR4_OSXMMEXCPT (1ul << 10) #define X86_CR4_UMIP (1ul << 11) +#define X86_CR4_LA57 (1ul << 12) #define X86_CR4_VMXE (1ul << 13) #define X86_CR4_SMXE (1ul << 14) #define X86_CR4_FSGSBASE (1ul << 16) @@ -36,6 +53,316 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) +struct xstate_header { + u64 xstate_bv; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +struct xstate { + u8 i387[512]; + struct xstate_header header; + u8 extended_state_area[0]; +} __attribute__ ((packed, aligned (64))); + +#define XFEATURE_MASK_FP BIT_ULL(0) +#define XFEATURE_MASK_SSE BIT_ULL(1) +#define XFEATURE_MASK_YMM BIT_ULL(2) +#define XFEATURE_MASK_BNDREGS BIT_ULL(3) +#define XFEATURE_MASK_BNDCSR BIT_ULL(4) +#define XFEATURE_MASK_OPMASK BIT_ULL(5) +#define XFEATURE_MASK_ZMM_Hi256 BIT_ULL(6) +#define XFEATURE_MASK_Hi16_ZMM BIT_ULL(7) +#define XFEATURE_MASK_PT BIT_ULL(8) +#define XFEATURE_MASK_PKRU BIT_ULL(9) +#define XFEATURE_MASK_PASID BIT_ULL(10) +#define XFEATURE_MASK_CET_USER BIT_ULL(11) +#define XFEATURE_MASK_CET_KERNEL BIT_ULL(12) +#define XFEATURE_MASK_LBR BIT_ULL(15) +#define XFEATURE_MASK_XTILE_CFG BIT_ULL(17) +#define XFEATURE_MASK_XTILE_DATA BIT_ULL(18) + +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA | \ + XFEATURE_MASK_XTILE_CFG) + +/* Note, these are ordered alphabetically to match kvm_cpuid_entry2. Eww. */ +enum cpuid_output_regs { + KVM_CPUID_EAX, + KVM_CPUID_EBX, + KVM_CPUID_ECX, + KVM_CPUID_EDX +}; + +/* + * Pack the information into a 64-bit value so that each X86_FEATURE_XXX can be + * passed by value with no overhead. 
+ */ +struct kvm_x86_cpu_feature { + u32 function; + u16 index; + u8 reg; + u8 bit; +}; +#define KVM_X86_CPU_FEATURE(fn, idx, gpr, __bit) \ +({ \ + struct kvm_x86_cpu_feature feature = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .bit = __bit, \ + }; \ + \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(feature.index) * BITS_PER_BYTE)); \ + feature; \ +}) + +/* + * Basic Leafs, a.k.a. Intel defined + */ +#define X86_FEATURE_MWAIT KVM_X86_CPU_FEATURE(0x1, 0, ECX, 3) +#define X86_FEATURE_VMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 5) +#define X86_FEATURE_SMX KVM_X86_CPU_FEATURE(0x1, 0, ECX, 6) +#define X86_FEATURE_PDCM KVM_X86_CPU_FEATURE(0x1, 0, ECX, 15) +#define X86_FEATURE_PCID KVM_X86_CPU_FEATURE(0x1, 0, ECX, 17) +#define X86_FEATURE_X2APIC KVM_X86_CPU_FEATURE(0x1, 0, ECX, 21) +#define X86_FEATURE_MOVBE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 22) +#define X86_FEATURE_TSC_DEADLINE_TIMER KVM_X86_CPU_FEATURE(0x1, 0, ECX, 24) +#define X86_FEATURE_XSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 26) +#define X86_FEATURE_OSXSAVE KVM_X86_CPU_FEATURE(0x1, 0, ECX, 27) +#define X86_FEATURE_RDRAND KVM_X86_CPU_FEATURE(0x1, 0, ECX, 30) +#define X86_FEATURE_HYPERVISOR KVM_X86_CPU_FEATURE(0x1, 0, ECX, 31) +#define X86_FEATURE_PAE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 6) +#define X86_FEATURE_MCE KVM_X86_CPU_FEATURE(0x1, 0, EDX, 7) +#define X86_FEATURE_APIC KVM_X86_CPU_FEATURE(0x1, 0, EDX, 9) +#define X86_FEATURE_CLFLUSH KVM_X86_CPU_FEATURE(0x1, 0, EDX, 19) +#define X86_FEATURE_XMM KVM_X86_CPU_FEATURE(0x1, 0, EDX, 25) +#define X86_FEATURE_XMM2 KVM_X86_CPU_FEATURE(0x1, 0, EDX, 26) +#define X86_FEATURE_FSGSBASE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 0) +#define X86_FEATURE_TSC_ADJUST KVM_X86_CPU_FEATURE(0x7, 0, EBX, 1) +#define X86_FEATURE_SGX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 2) +#define X86_FEATURE_HLE KVM_X86_CPU_FEATURE(0x7, 0, EBX, 4) +#define X86_FEATURE_SMEP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 7) +#define X86_FEATURE_INVPCID KVM_X86_CPU_FEATURE(0x7, 0, EBX, 10) +#define X86_FEATURE_RTM KVM_X86_CPU_FEATURE(0x7, 0, EBX, 11) +#define X86_FEATURE_MPX KVM_X86_CPU_FEATURE(0x7, 0, EBX, 14) +#define X86_FEATURE_SMAP KVM_X86_CPU_FEATURE(0x7, 0, EBX, 20) +#define X86_FEATURE_PCOMMIT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 22) +#define X86_FEATURE_CLFLUSHOPT KVM_X86_CPU_FEATURE(0x7, 0, EBX, 23) +#define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24) +#define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2) +#define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) +#define X86_FEATURE_OSPKE KVM_X86_CPU_FEATURE(0x7, 0, ECX, 4) +#define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) +#define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) +#define X86_FEATURE_SGX_LC KVM_X86_CPU_FEATURE(0x7, 0, ECX, 30) +#define X86_FEATURE_SHSTK KVM_X86_CPU_FEATURE(0x7, 0, ECX, 7) +#define X86_FEATURE_IBT KVM_X86_CPU_FEATURE(0x7, 0, EDX, 20) +#define X86_FEATURE_AMX_TILE KVM_X86_CPU_FEATURE(0x7, 0, EDX, 24) +#define X86_FEATURE_SPEC_CTRL KVM_X86_CPU_FEATURE(0x7, 0, EDX, 26) +#define X86_FEATURE_ARCH_CAPABILITIES KVM_X86_CPU_FEATURE(0x7, 0, EDX, 29) +#define X86_FEATURE_PKS KVM_X86_CPU_FEATURE(0x7, 0, ECX, 31) +#define X86_FEATURE_XTILECFG KVM_X86_CPU_FEATURE(0xD, 0, EAX, 17) +#define X86_FEATURE_XTILEDATA KVM_X86_CPU_FEATURE(0xD, 0, EAX, 18) +#define X86_FEATURE_XSAVES KVM_X86_CPU_FEATURE(0xD, 1, EAX, 3) +#define X86_FEATURE_XFD KVM_X86_CPU_FEATURE(0xD, 1, EAX, 4) 
+#define X86_FEATURE_XTILEDATA_XFD KVM_X86_CPU_FEATURE(0xD, 18, ECX, 2) + +/* + * Extended Leafs, a.k.a. AMD defined + */ +#define X86_FEATURE_SVM KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 2) +#define X86_FEATURE_NX KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 20) +#define X86_FEATURE_GBPAGES KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26) +#define X86_FEATURE_RDTSCP KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27) +#define X86_FEATURE_LM KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 29) +#define X86_FEATURE_INVTSC KVM_X86_CPU_FEATURE(0x80000007, 0, EDX, 8) +#define X86_FEATURE_RDPRU KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 4) +#define X86_FEATURE_AMD_IBPB KVM_X86_CPU_FEATURE(0x80000008, 0, EBX, 12) +#define X86_FEATURE_NPT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 0) +#define X86_FEATURE_LBRV KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 1) +#define X86_FEATURE_NRIPS KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 3) +#define X86_FEATURE_TSCRATEMSR KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 4) +#define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) +#define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) +#define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) +#define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) +#define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) + +/* + * KVM defined paravirt features. + */ +#define X86_FEATURE_KVM_CLOCKSOURCE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 0) +#define X86_FEATURE_KVM_NOP_IO_DELAY KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 1) +#define X86_FEATURE_KVM_MMU_OP KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 2) +#define X86_FEATURE_KVM_CLOCKSOURCE2 KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 3) +#define X86_FEATURE_KVM_ASYNC_PF KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 4) +#define X86_FEATURE_KVM_STEAL_TIME KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 5) +#define X86_FEATURE_KVM_PV_EOI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 6) +#define X86_FEATURE_KVM_PV_UNHALT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 7) +/* Bit 8 apparently isn't used?!?! */ +#define X86_FEATURE_KVM_PV_TLB_FLUSH KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 9) +#define X86_FEATURE_KVM_ASYNC_PF_VMEXIT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 10) +#define X86_FEATURE_KVM_PV_SEND_IPI KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 11) +#define X86_FEATURE_KVM_POLL_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 12) +#define X86_FEATURE_KVM_PV_SCHED_YIELD KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 13) +#define X86_FEATURE_KVM_ASYNC_PF_INT KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 14) +#define X86_FEATURE_KVM_MSI_EXT_DEST_ID KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 15) +#define X86_FEATURE_KVM_HC_MAP_GPA_RANGE KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 16) +#define X86_FEATURE_KVM_MIGRATION_CONTROL KVM_X86_CPU_FEATURE(0x40000001, 0, EAX, 17) + +/* + * Same idea as X86_FEATURE_XXX, but X86_PROPERTY_XXX retrieves a multi-bit + * value/property as opposed to a single-bit feature. Again, pack the info + * into a 64-bit value to pass by value with no overhead. 
+ */ +struct kvm_x86_cpu_property { + u32 function; + u8 index; + u8 reg; + u8 lo_bit; + u8 hi_bit; +}; +#define KVM_X86_CPU_PROPERTY(fn, idx, gpr, low_bit, high_bit) \ +({ \ + struct kvm_x86_cpu_property property = { \ + .function = fn, \ + .index = idx, \ + .reg = KVM_CPUID_##gpr, \ + .lo_bit = low_bit, \ + .hi_bit = high_bit, \ + }; \ + \ + kvm_static_assert(low_bit < high_bit); \ + kvm_static_assert((fn & 0xc0000000) == 0 || \ + (fn & 0xc0000000) == 0x40000000 || \ + (fn & 0xc0000000) == 0x80000000 || \ + (fn & 0xc0000000) == 0xc0000000); \ + kvm_static_assert(idx < BIT(sizeof(property.index) * BITS_PER_BYTE)); \ + property; \ +}) + +#define X86_PROPERTY_MAX_BASIC_LEAF KVM_X86_CPU_PROPERTY(0, 0, EAX, 0, 31) +#define X86_PROPERTY_PMU_VERSION KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 0, 7) +#define X86_PROPERTY_PMU_NR_GP_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 8, 15) +#define X86_PROPERTY_PMU_GP_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 16, 23) +#define X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH KVM_X86_CPU_PROPERTY(0xa, 0, EAX, 24, 31) +#define X86_PROPERTY_PMU_EVENTS_MASK KVM_X86_CPU_PROPERTY(0xa, 0, EBX, 0, 7) +#define X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK KVM_X86_CPU_PROPERTY(0xa, 0, ECX, 0, 31) +#define X86_PROPERTY_PMU_NR_FIXED_COUNTERS KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 0, 4) +#define X86_PROPERTY_PMU_FIXED_COUNTERS_BIT_WIDTH KVM_X86_CPU_PROPERTY(0xa, 0, EDX, 5, 12) + +#define X86_PROPERTY_SUPPORTED_XCR0_LO KVM_X86_CPU_PROPERTY(0xd, 0, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE_XCR0 KVM_X86_CPU_PROPERTY(0xd, 0, EBX, 0, 31) +#define X86_PROPERTY_XSTATE_MAX_SIZE KVM_X86_CPU_PROPERTY(0xd, 0, ECX, 0, 31) +#define X86_PROPERTY_SUPPORTED_XCR0_HI KVM_X86_CPU_PROPERTY(0xd, 0, EDX, 0, 31) + +#define X86_PROPERTY_XSTATE_TILE_SIZE KVM_X86_CPU_PROPERTY(0xd, 18, EAX, 0, 31) +#define X86_PROPERTY_XSTATE_TILE_OFFSET KVM_X86_CPU_PROPERTY(0xd, 18, EBX, 0, 31) +#define X86_PROPERTY_AMX_MAX_PALETTE_TABLES KVM_X86_CPU_PROPERTY(0x1d, 0, EAX, 0, 31) +#define X86_PROPERTY_AMX_TOTAL_TILE_BYTES KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 0, 15) +#define X86_PROPERTY_AMX_BYTES_PER_TILE KVM_X86_CPU_PROPERTY(0x1d, 1, EAX, 16, 31) +#define X86_PROPERTY_AMX_BYTES_PER_ROW KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 0, 15) +#define X86_PROPERTY_AMX_NR_TILE_REGS KVM_X86_CPU_PROPERTY(0x1d, 1, EBX, 16, 31) +#define X86_PROPERTY_AMX_MAX_ROWS KVM_X86_CPU_PROPERTY(0x1d, 1, ECX, 0, 15) + +#define X86_PROPERTY_MAX_KVM_LEAF KVM_X86_CPU_PROPERTY(0x40000000, 0, EAX, 0, 31) + +#define X86_PROPERTY_MAX_EXT_LEAF KVM_X86_CPU_PROPERTY(0x80000000, 0, EAX, 0, 31) +#define X86_PROPERTY_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 0, 7) +#define X86_PROPERTY_MAX_VIRT_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 8, 15) +#define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) +#define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) + +#define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) + +/* + * Intel's architectural PMU events are bizarre. They have a "feature" bit + * that indicates the feature is _not_ supported, and a property that states + * the length of the bit mask of unsupported features. A feature is supported + * if the size of the bit mask is larger than the "unavailable" bit, and said + * bit is not set. Fixed counters also bizarre enumeration, but inverted from + * arch events for general purpose counters. 
Fixed counters are supported if a + * feature flag is set **OR** the total number of fixed counters is greater + * than index of the counter. + * + * Wrap the events for general purpose and fixed counters to simplify checking + * whether or not a given architectural event is supported. + */ +struct kvm_x86_pmu_feature { + struct kvm_x86_cpu_feature f; +}; +#define KVM_X86_PMU_FEATURE(__reg, __bit) \ +({ \ + struct kvm_x86_pmu_feature feature = { \ + .f = KVM_X86_CPU_FEATURE(0xa, 0, __reg, __bit), \ + }; \ + \ + kvm_static_assert(KVM_CPUID_##__reg == KVM_CPUID_EBX || \ + KVM_CPUID_##__reg == KVM_CPUID_ECX); \ + feature; \ +}) + +#define X86_PMU_FEATURE_CPU_CYCLES KVM_X86_PMU_FEATURE(EBX, 0) +#define X86_PMU_FEATURE_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 1) +#define X86_PMU_FEATURE_REFERENCE_CYCLES KVM_X86_PMU_FEATURE(EBX, 2) +#define X86_PMU_FEATURE_LLC_REFERENCES KVM_X86_PMU_FEATURE(EBX, 3) +#define X86_PMU_FEATURE_LLC_MISSES KVM_X86_PMU_FEATURE(EBX, 4) +#define X86_PMU_FEATURE_BRANCH_INSNS_RETIRED KVM_X86_PMU_FEATURE(EBX, 5) +#define X86_PMU_FEATURE_BRANCHES_MISPREDICTED KVM_X86_PMU_FEATURE(EBX, 6) +#define X86_PMU_FEATURE_TOPDOWN_SLOTS KVM_X86_PMU_FEATURE(EBX, 7) + +#define X86_PMU_FEATURE_INSNS_RETIRED_FIXED KVM_X86_PMU_FEATURE(ECX, 0) +#define X86_PMU_FEATURE_CPU_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 1) +#define X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED KVM_X86_PMU_FEATURE(ECX, 2) +#define X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED KVM_X86_PMU_FEATURE(ECX, 3) + +static inline unsigned int x86_family(unsigned int eax) +{ + unsigned int x86; + + x86 = (eax >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +static inline unsigned int x86_model(unsigned int eax) +{ + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); +} + +/* Page table bitfield declarations */ +#define PTE_PRESENT_MASK BIT_ULL(0) +#define PTE_WRITABLE_MASK BIT_ULL(1) +#define PTE_USER_MASK BIT_ULL(2) +#define PTE_ACCESSED_MASK BIT_ULL(5) +#define PTE_DIRTY_MASK BIT_ULL(6) +#define PTE_LARGE_MASK BIT_ULL(7) +#define PTE_GLOBAL_MASK BIT_ULL(8) +#define PTE_NX_MASK BIT_ULL(63) + +#define PHYSICAL_PAGE_MASK GENMASK_ULL(51, 12) + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1ULL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1) & PHYSICAL_PAGE_MASK) + +#define HUGEPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define HUGEPAGE_SIZE(x) (1UL << HUGEPAGE_SHIFT(x)) +#define HUGEPAGE_MASK(x) (~(HUGEPAGE_SIZE(x) - 1) & PHYSICAL_PAGE_MASK) + +#define PTE_GET_PA(pte) ((pte) & PHYSICAL_PAGE_MASK) +#define PTE_GET_PFN(pte) (PTE_GET_PA(pte) >> PAGE_SHIFT) + /* General Registers in 64-Bit Mode */ struct gpr64_regs { u64 rax; @@ -59,7 +386,7 @@ struct gpr64_regs { struct desc64 { uint16_t limit0; uint16_t base0; - unsigned base1:8, s:1, type:4, dpl:2, p:1; + unsigned base1:8, type:4, s:1, dpl:2, p:1; unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8; uint32_t base3; uint32_t zero1; @@ -70,6 +397,21 @@ struct desc_ptr { uint64_t address; } __attribute__((packed)); +struct kvm_x86_state { + struct kvm_xsave *xsave; + struct kvm_vcpu_events events; + struct kvm_mp_state mp_state; + struct kvm_regs regs; + struct kvm_xcrs xcrs; + struct kvm_sregs sregs; + struct kvm_debugregs debugregs; + union { + struct kvm_nested_state nested; + char nested_[16384]; + }; + struct kvm_msrs msrs; +}; + static inline uint64_t get_desc64_base(const struct desc64 *desc) { return ((uint64_t)desc->base3 << 32) | @@ -223,6 +565,31 @@ static inline void set_cr4(uint64_t val) __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); } 
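/*
 * Editor's note: the snippet below is an illustrative sketch, not part of the
 * patch above. It shows how the x86_family()/x86_model() helpers added
 * earlier in this header decode a CPUID.01H.EAX family/model/stepping (FMS)
 * value, including the extended-family and extended-model adjustments. The
 * function name example_decode_fms and the sample value 0x906ea are
 * hypothetical, chosen only for the worked example.
 */
static inline void example_decode_fms(void)
{
	unsigned int fms = 0x906ea;	/* assumed sample CPUID.01H.EAX value */

	/*
	 * Base family is bits 11:8; the extended family (bits 27:20) is added
	 * only when the base family is 0xf. For 0x906ea this yields 0x6.
	 */
	unsigned int family = x86_family(fms);

	/*
	 * Extended model (bits 19:16) forms the high nibble, base model
	 * (bits 7:4) the low nibble. For 0x906ea this yields 0x9e.
	 */
	unsigned int model = x86_model(fms);

	(void)family;
	(void)model;
}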
+static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + __asm__ __volatile__("xgetbv;" + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax | ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + __asm__ __volatile__("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +static inline void wrpkru(u32 pkru) +{ + /* Note, ECX and EDX are architecturally required to be '0'. */ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(0), "d"(0)); +} + static inline struct desc_ptr get_gdt(void) { struct desc_ptr gdt; @@ -239,104 +606,729 @@ static inline struct desc_ptr get_idt(void) return idt; } -#define SET_XMM(__var, __xmm) \ - asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm) +static inline void outl(uint16_t port, uint32_t value) +{ + __asm__ __volatile__("outl %%eax, %%dx" : : "d"(port), "a"(value)); +} + +static inline void __cpuid(uint32_t function, uint32_t index, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + *eax = function; + *ecx = index; + + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(uint32_t function, + uint32_t *eax, uint32_t *ebx, + uint32_t *ecx, uint32_t *edx) +{ + return __cpuid(function, 0, eax, ebx, ecx, edx); +} + +static inline uint32_t this_cpu_fms(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline uint32_t this_cpu_family(void) +{ + return x86_family(this_cpu_fms()); +} + +static inline uint32_t this_cpu_model(void) +{ + return x86_model(this_cpu_fms()); +} + +static inline bool this_cpu_vendor_string_is(const char *vendor) +{ + const uint32_t *chunk = (const uint32_t *)vendor; + uint32_t eax, ebx, ecx, edx; + + cpuid(0, &eax, &ebx, &ecx, &edx); + return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); +} + +static inline bool this_cpu_is_intel(void) +{ + return this_cpu_vendor_string_is("GenuineIntel"); +} + +/* + * Exclude early K5 samples with a vendor string of "AMDisbetter!" 
+ */ +static inline bool this_cpu_is_amd(void) +{ + return this_cpu_vendor_string_is("AuthenticAMD"); +} + +static inline uint32_t __this_cpu_has(uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) +{ + uint32_t gprs[4]; + + __cpuid(function, index, + &gprs[KVM_CPUID_EAX], &gprs[KVM_CPUID_EBX], + &gprs[KVM_CPUID_ECX], &gprs[KVM_CPUID_EDX]); + + return (gprs[reg] & GENMASK(hi, lo)) >> lo; +} + +static inline bool this_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return __this_cpu_has(feature.function, feature.index, + feature.reg, feature.bit, feature.bit); +} + +static inline uint32_t this_cpu_property(struct kvm_x86_cpu_property property) +{ + return __this_cpu_has(property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + +static __always_inline bool this_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = this_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} -static inline void set_xmm(int n, unsigned long val) +static inline bool this_pmu_has(struct kvm_x86_pmu_feature feature) { - switch (n) { + uint32_t nr_bits; + + if (feature.f.reg == KVM_CPUID_EBX) { + nr_bits = this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + return nr_bits > feature.f.bit && !this_cpu_has(feature.f); + } + + GUEST_ASSERT(feature.f.reg == KVM_CPUID_ECX); + nr_bits = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + return nr_bits > feature.f.bit || this_cpu_has(feature.f); +} + +static __always_inline uint64_t this_cpu_supported_xcr0(void) +{ + if (!this_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) + return 0; + + return this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | + ((uint64_t)this_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); +} + +typedef u32 __attribute__((vector_size(16))) sse128_t; +#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } +#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) +#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) + +static inline void read_sse_reg(int reg, sse128_t *data) +{ + switch (reg) { case 0: - SET_XMM(val, xmm0); + asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: - SET_XMM(val, xmm1); + asm("movdqa %%xmm1, %0" : "=m"(*data)); break; case 2: - SET_XMM(val, xmm2); + asm("movdqa %%xmm2, %0" : "=m"(*data)); break; case 3: - SET_XMM(val, xmm3); + asm("movdqa %%xmm3, %0" : "=m"(*data)); break; case 4: - SET_XMM(val, xmm4); + asm("movdqa %%xmm4, %0" : "=m"(*data)); break; case 5: - SET_XMM(val, xmm5); + asm("movdqa %%xmm5, %0" : "=m"(*data)); break; case 6: - SET_XMM(val, xmm6); + asm("movdqa %%xmm6, %0" : "=m"(*data)); break; case 7: - SET_XMM(val, xmm7); + asm("movdqa %%xmm7, %0" : "=m"(*data)); break; + default: + BUG(); } } -typedef unsigned long v1di __attribute__ ((vector_size (8))); -static inline unsigned long get_xmm(int n) +static inline void write_sse_reg(int reg, const sse128_t *data) { - assert(n >= 0 && n <= 7); - - register v1di xmm0 __asm__("%xmm0"); - register v1di xmm1 __asm__("%xmm1"); - register v1di xmm2 __asm__("%xmm2"); - register v1di xmm3 __asm__("%xmm3"); - register v1di xmm4 __asm__("%xmm4"); - register v1di xmm5 
__asm__("%xmm5"); - register v1di xmm6 __asm__("%xmm6"); - register v1di xmm7 __asm__("%xmm7"); - switch (n) { + switch (reg) { case 0: - return (unsigned long)xmm0; + asm("movdqa %0, %%xmm0" : : "m"(*data)); + break; case 1: - return (unsigned long)xmm1; + asm("movdqa %0, %%xmm1" : : "m"(*data)); + break; case 2: - return (unsigned long)xmm2; + asm("movdqa %0, %%xmm2" : : "m"(*data)); + break; case 3: - return (unsigned long)xmm3; + asm("movdqa %0, %%xmm3" : : "m"(*data)); + break; case 4: - return (unsigned long)xmm4; + asm("movdqa %0, %%xmm4" : : "m"(*data)); + break; case 5: - return (unsigned long)xmm5; + asm("movdqa %0, %%xmm5" : : "m"(*data)); + break; case 6: - return (unsigned long)xmm6; + asm("movdqa %0, %%xmm6" : : "m"(*data)); + break; case 7: - return (unsigned long)xmm7; + asm("movdqa %0, %%xmm7" : : "m"(*data)); + break; + default: + BUG(); } +} + +static inline void cpu_relax(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +#define ud2() \ + __asm__ __volatile__( \ + "ud2\n" \ + ) + +#define hlt() \ + __asm__ __volatile__( \ + "hlt\n" \ + ) + +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu); +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state); +void kvm_x86_state_cleanup(struct kvm_x86_state *state); + +const struct kvm_msr_list *kvm_get_msr_index_list(void); +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void); +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index); +uint64_t kvm_get_feature_msr(uint64_t msr_index); + +static inline void vcpu_msrs_get(struct kvm_vcpu *vcpu, + struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vcpu, KVM_GET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_GET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ? -1 : msrs->entries[r].index); +} +static inline void vcpu_msrs_set(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs) +{ + int r = __vcpu_ioctl(vcpu, KVM_SET_MSRS, msrs); + + TEST_ASSERT(r == msrs->nmsrs, + "KVM_SET_MSRS failed, r: %i (failed on MSR %x)", + r, r < 0 || r >= msrs->nmsrs ? 
-1 : msrs->entries[r].index); +} +static inline void vcpu_debugregs_get(struct kvm_vcpu *vcpu, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vcpu, KVM_GET_DEBUGREGS, debugregs); +} +static inline void vcpu_debugregs_set(struct kvm_vcpu *vcpu, + struct kvm_debugregs *debugregs) +{ + vcpu_ioctl(vcpu, KVM_SET_DEBUGREGS, debugregs); +} +static inline void vcpu_xsave_get(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_GET_XSAVE, xsave); +} +static inline void vcpu_xsave2_get(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_GET_XSAVE2, xsave); +} +static inline void vcpu_xsave_set(struct kvm_vcpu *vcpu, + struct kvm_xsave *xsave) +{ + vcpu_ioctl(vcpu, KVM_SET_XSAVE, xsave); +} +static inline void vcpu_xcrs_get(struct kvm_vcpu *vcpu, + struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vcpu, KVM_GET_XCRS, xcrs); +} +static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs) +{ + vcpu_ioctl(vcpu, KVM_SET_XCRS, xcrs); +} + +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index); +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void); +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu); + +static inline uint32_t kvm_cpu_fms(void) +{ + return get_cpuid_entry(kvm_get_supported_cpuid(), 0x1, 0)->eax; +} + +static inline uint32_t kvm_cpu_family(void) +{ + return x86_family(kvm_cpu_fms()); +} + +static inline uint32_t kvm_cpu_model(void) +{ + return x86_model(kvm_cpu_fms()); +} + +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature); + +static inline bool kvm_cpu_has(struct kvm_x86_cpu_feature feature) +{ + return kvm_cpuid_has(kvm_get_supported_cpuid(), feature); +} + +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property); + +static inline uint32_t kvm_cpu_property(struct kvm_x86_cpu_property property) +{ + return kvm_cpuid_property(kvm_get_supported_cpuid(), property); +} + +static __always_inline bool kvm_cpu_has_p(struct kvm_x86_cpu_property property) +{ + uint32_t max_leaf; + + switch (property.function & 0xc0000000) { + case 0: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_BASIC_LEAF); + break; + case 0x40000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_KVM_LEAF); + break; + case 0x80000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_EXT_LEAF); + break; + case 0xc0000000: + max_leaf = kvm_cpu_property(X86_PROPERTY_MAX_CENTAUR_LEAF); + } + return max_leaf >= property.function; +} + +static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) +{ + uint32_t nr_bits; + + if (feature.f.reg == KVM_CPUID_EBX) { + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + return nr_bits > feature.f.bit && !kvm_cpu_has(feature.f); + } + + TEST_ASSERT_EQ(feature.f.reg, KVM_CPUID_ECX); + nr_bits = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + return nr_bits > feature.f.bit || kvm_cpu_has(feature.f); +} + +static __always_inline uint64_t kvm_cpu_supported_xcr0(void) +{ + if (!kvm_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) + return 0; + + return kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | + ((uint64_t)kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); +} + +static inline size_t kvm_cpuid2_size(int nr_entries) +{ + return sizeof(struct kvm_cpuid2) + + sizeof(struct kvm_cpuid_entry2) * nr_entries; +} + +/* + * Allocate a "struct kvm_cpuid2* instance, with the 0-length 
arrary of + * entries sized to hold @nr_entries. The caller is responsible for freeing + * the struct. + */ +static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries) +{ + struct kvm_cpuid2 *cpuid; + + cpuid = malloc(kvm_cpuid2_size(nr_entries)); + TEST_ASSERT(cpuid, "-ENOMEM when allocating kvm_cpuid2"); + + cpuid->nent = nr_entries; + + return cpuid; +} + +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu); + +static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function, + uint32_t index) +{ + return (struct kvm_cpuid_entry2 *)get_cpuid_entry(vcpu->cpuid, + function, index); +} + +static inline struct kvm_cpuid_entry2 *vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu, + uint32_t function) +{ + return __vcpu_get_cpuid_entry(vcpu, function, 0); +} + +static inline int __vcpu_set_cpuid(struct kvm_vcpu *vcpu) +{ + int r; + + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + r = __vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + if (r) + return r; + + /* On success, refresh the cache to pick up adjustments made by KVM. */ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); return 0; } -bool is_intel_cpu(void); +static inline void vcpu_set_cpuid(struct kvm_vcpu *vcpu) +{ + TEST_ASSERT(vcpu->cpuid, "Must do vcpu_init_cpuid() first"); + vcpu_ioctl(vcpu, KVM_SET_CPUID2, vcpu->cpuid); + + /* Refresh the cache to pick up adjustments made by KVM. */ + vcpu_ioctl(vcpu, KVM_GET_CPUID2, vcpu->cpuid); +} + +void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_property property, + uint32_t value); +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr); + +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function); -struct kvm_x86_state; -struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_x86_state *state); +static inline bool vcpu_cpuid_has(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + struct kvm_cpuid_entry2 *entry; -struct kvm_msr_list *kvm_get_msr_index_list(void); + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + return *((&entry->eax) + feature.reg) & BIT(feature.bit); +} -struct kvm_cpuid2 *kvm_get_supported_cpuid(void); -void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_cpuid2 *cpuid); +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set); -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index); +static inline void vcpu_set_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, true); + +} -static inline struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_entry(uint32_t function) +static inline void vcpu_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature) +{ + vcpu_set_or_clear_cpuid_feature(vcpu, feature, false); +} + +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index); +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value); + +/* + * Assert on an MSR access(es) and pretty print the MSR name when possible. + * Note, the caller provides the stringified name so that the name of macro is + * printed, not the value the macro resolves to (due to macro expansion). + */ +#define TEST_ASSERT_MSR(cond, fmt, msr, str, args...) 
\ +do { \ + if (__builtin_constant_p(msr)) { \ + TEST_ASSERT(cond, fmt, str, args); \ + } else if (!(cond)) { \ + char buf[16]; \ + \ + snprintf(buf, sizeof(buf), "MSR 0x%x", msr); \ + TEST_ASSERT(cond, fmt, buf, args); \ + } \ +} while (0) + +/* + * Returns true if KVM should return the last written value when reading an MSR + * from userspace, e.g. the MSR isn't a command MSR, doesn't emulate state that + * is changing, etc. This is NOT an exhaustive list! The intent is to filter + * out MSRs that are not durable _and_ that a selftest wants to write. + */ +static inline bool is_durable_msr(uint32_t msr) { - return kvm_get_supported_cpuid_index(function, 0); + return msr != MSR_IA32_TSC; } -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); -int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value); -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value); +#define vcpu_set_msr(vcpu, msr, val) \ +do { \ + uint64_t r, v = val; \ + \ + TEST_ASSERT_MSR(_vcpu_set_msr(vcpu, msr, v) == 1, \ + "KVM_SET_MSRS failed on %s, value = 0x%lx", msr, #msr, v); \ + if (!is_durable_msr(msr)) \ + break; \ + r = vcpu_get_msr(vcpu, msr); \ + TEST_ASSERT_MSR(r == v, "Set %s to '0x%lx', got back '0x%lx'", msr, #msr, v, r);\ +} while (0) -uint32_t kvm_get_cpuid_max_basic(void); -uint32_t kvm_get_cpuid_max_extended(void); void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); +void kvm_init_vm_address_properties(struct kvm_vm *vm); +bool vm_is_unrestricted_guest(struct kvm_vm *vm); + +struct ex_regs { + uint64_t rax, rcx, rdx, rbx; + uint64_t rbp, rsi, rdi; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; + uint64_t vector; + uint64_t error_code; + uint64_t rip; + uint64_t cs; + uint64_t rflags; +}; + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)); + +/* If a toddler were to say "abracadabra". */ +#define KVM_EXCEPTION_MAGIC 0xabacadabaULL + +/* + * KVM selftest exception fixup uses registers to coordinate with the exception + * handler, versus the kernel's in-memory tables and KVM-Unit-Tests's in-memory + * per-CPU data. Using only registers avoids having to map memory into the + * guest, doesn't require a valid, stable GS.base, and reduces the risk of + * for recursive faults when accessing memory in the handler. The downside to + * using registers is that it restricts what registers can be used by the actual + * instruction. But, selftests are 64-bit only, making register* pressure a + * minor concern. Use r9-r11 as they are volatile, i.e. don't need to be saved + * by the callee, and except for r11 are not implicit parameters to any + * instructions. Ideally, fixup would use r8-r10 and thus avoid implicit + * parameters entirely, but Hyper-V's hypercall ABI uses r8 and testing Hyper-V + * is higher priority than testing non-faulting SYSCALL/SYSRET. + * + * Note, the fixup handler deliberately does not handle #DE, i.e. the vector + * is guaranteed to be non-zero on fault. 
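For reference, a minimal sketch of how a test can hook the ex_regs/vm_install_exception_handler() plumbing declared above; the 2-byte instruction length and GP_VECTOR are assumptions for the example, and whatever descriptor-table setup the library needs is omitted.

/* Guest-side handler: skip the faulting instruction (assumed 2 bytes, e.g. wrmsr). */
static void guest_gp_handler(struct ex_regs *regs)
{
	regs->rip += 2;
}

/* Host-side, before running the vCPU: route #GP to the guest handler. */
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);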
+ * + * REGISTER INPUTS: + * r9 = MAGIC + * r10 = RIP + * r11 = new RIP on fault + * + * REGISTER OUTPUTS: + * r9 = exception vector (non-zero) + * r10 = error code + */ +#define __KVM_ASM_SAFE(insn, fep) \ + "mov $" __stringify(KVM_EXCEPTION_MAGIC) ", %%r9\n\t" \ + "lea 1f(%%rip), %%r10\n\t" \ + "lea 2f(%%rip), %%r11\n\t" \ + fep "1: " insn "\n\t" \ + "xor %%r9, %%r9\n\t" \ + "2:\n\t" \ + "mov %%r9b, %[vector]\n\t" \ + "mov %%r10, %[error_code]\n\t" + +#define KVM_ASM_SAFE(insn) __KVM_ASM_SAFE(insn, "") +#define KVM_ASM_SAFE_FEP(insn) __KVM_ASM_SAFE(insn, KVM_FEP) + +#define KVM_ASM_SAFE_OUTPUTS(v, ec) [vector] "=qm"(v), [error_code] "=rm"(ec) +#define KVM_ASM_SAFE_CLOBBERS "r9", "r10", "r11" + +#define kvm_asm_safe(insn, inputs...) \ +({ \ + uint64_t ign_error_code; \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_ec(insn, error_code, inputs...) \ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_fep(insn, inputs...) \ +({ \ + uint64_t ign_error_code; \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define kvm_asm_safe_ec_fep(insn, error_code, inputs...) \ +({ \ + uint8_t vector; \ + \ + asm volatile(KVM_ASM_SAFE_FEP(insn) \ + : KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : inputs \ + : KVM_ASM_SAFE_CLOBBERS); \ + vector; \ +}) + +#define BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ +static inline uint8_t insn##_safe ##_fep(uint32_t idx, uint64_t *val) \ +{ \ + uint64_t error_code; \ + uint8_t vector; \ + uint32_t a, d; \ + \ + asm volatile(KVM_ASM_SAFE##_FEP(#insn) \ + : "=a"(a), "=d"(d), \ + KVM_ASM_SAFE_OUTPUTS(vector, error_code) \ + : "c"(idx) \ + : KVM_ASM_SAFE_CLOBBERS); \ + \ + *val = (uint64_t)a | ((uint64_t)d << 32); \ + return vector; \ +} + +/* + * Generate {insn}_safe() and {insn}_safe_fep() helpers for instructions that + * use ECX as in input index, and EDX:EAX as a 64-bit output. 
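As an illustration of the fixup-based helpers, guest code might probe an MSR with the rdmsr_safe() helper generated just below; the expectation of #GP for a non-existent MSR and the use of GP_VECTOR/GUEST_PRINTF are assumptions of the sketch.

static void guest_probe_msr(uint32_t msr)
{
	uint64_t val;
	uint8_t vector;

	/* 0 on success, otherwise the vector of the exception that fired. */
	vector = rdmsr_safe(msr, &val);
	if (vector)
		GUEST_ASSERT(vector == GP_VECTOR);
	else
		GUEST_PRINTF("MSR 0x%x = 0x%lx\n", msr, val);
}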
+ */ +#define BUILD_READ_U64_SAFE_HELPERS(insn) \ + BUILD_READ_U64_SAFE_HELPER(insn, , ) \ + BUILD_READ_U64_SAFE_HELPER(insn, _fep, _FEP) \ + +BUILD_READ_U64_SAFE_HELPERS(rdmsr) +BUILD_READ_U64_SAFE_HELPERS(rdpmc) +BUILD_READ_U64_SAFE_HELPERS(xgetbv) + +static inline uint8_t wrmsr_safe(uint32_t msr, uint64_t val) +{ + return kvm_asm_safe("wrmsr", "a"(val & -1u), "d"(val >> 32), "c"(msr)); +} + +static inline uint8_t xsetbv_safe(uint32_t index, uint64_t value) +{ + u32 eax = value; + u32 edx = value >> 32; + + return kvm_asm_safe("xsetbv", "a" (eax), "d" (edx), "c" (index)); +} + +bool kvm_is_tdp_enabled(void); + +static inline bool kvm_is_pmu_enabled(void) +{ + return get_kvm_param_bool("enable_pmu"); +} + +static inline bool kvm_is_forced_emulation_enabled(void) +{ + return !!get_kvm_param_integer("force_emulation_prefix"); +} + +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level); +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3); +uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1); +void xen_hypercall(uint64_t nr, uint64_t a0, void *a1); + +static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa, + uint64_t size, uint64_t flags) +{ + return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0); +} + +static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, + uint64_t flags) +{ + uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags); + + GUEST_ASSERT(!ret); +} + +void __vm_xsave_require_permission(uint64_t xfeature, const char *name); + +#define vm_xsave_require_permission(xfeature) \ + __vm_xsave_require_permission(xfeature, #xfeature) + +enum pg_level { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_512G, + PG_LEVEL_NUM +}; + +#define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) +#define PG_LEVEL_SIZE(_level) (1ull << PG_LEVEL_SHIFT(_level)) + +#define PG_SIZE_4K PG_LEVEL_SIZE(PG_LEVEL_4K) +#define PG_SIZE_2M PG_LEVEL_SIZE(PG_LEVEL_2M) +#define PG_SIZE_1G PG_LEVEL_SIZE(PG_LEVEL_1G) + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level); +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level); /* * Basic CPU control in CR0 @@ -353,34 +1345,28 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits); #define X86_CR0_CD (1UL<<30) /* Cache Disable */ #define X86_CR0_PG (1UL<<31) /* Paging */ -#define APIC_BASE_MSR 0x800 -#define X2APIC_ENABLE (1UL << 10) -#define APIC_ICR 0x300 -#define APIC_DEST_SELF 0x40000 -#define APIC_DEST_ALLINC 0x80000 -#define APIC_DEST_ALLBUT 0xC0000 -#define APIC_ICR_RR_MASK 0x30000 -#define APIC_ICR_RR_INVALID 0x00000 -#define APIC_ICR_RR_INPROG 0x10000 -#define APIC_ICR_RR_VALID 0x20000 -#define APIC_INT_LEVELTRIG 0x08000 -#define APIC_INT_ASSERT 0x04000 -#define APIC_ICR_BUSY 0x01000 -#define APIC_DEST_LOGICAL 0x00800 -#define APIC_DEST_PHYSICAL 0x00000 -#define APIC_DM_FIXED 0x00000 -#define APIC_DM_FIXED_MASK 0x00700 -#define APIC_DM_LOWEST 0x00100 -#define APIC_DM_SMI 0x00200 -#define APIC_DM_REMRD 0x00300 -#define APIC_DM_NMI 0x00400 -#define APIC_DM_INIT 0x00500 -#define APIC_DM_STARTUP 0x00600 -#define APIC_DM_EXTINT 0x00700 -#define APIC_VECTOR_MASK 0x000FF -#define APIC_ICR2 0x310 - -/* VMX_EPT_VPID_CAP bits */ -#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21) +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 
+#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 +#define PFERR_IMPLICIT_ACCESS_BIT 48 + +#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) +#define PFERR_USER_MASK BIT(PFERR_USER_BIT) +#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) +#define PFERR_PK_MASK BIT(PFERR_PK_BIT) +#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) +#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) + +bool sys_clocksource_is_based_on_tsc(void); #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/sev.h b/tools/testing/selftests/kvm/include/x86_64/sev.h new file mode 100644 index 000000000000..82c11c81a956 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/sev.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers used for SEV guests + * + */ +#ifndef SELFTEST_KVM_SEV_H +#define SELFTEST_KVM_SEV_H + +#include <stdint.h> +#include <stdbool.h> + +#include "linux/psp-sev.h" + +#include "kvm_util.h" +#include "svm_util.h" +#include "processor.h" + +enum sev_guest_state { + SEV_GUEST_STATE_UNINITIALIZED = 0, + SEV_GUEST_STATE_LAUNCH_UPDATE, + SEV_GUEST_STATE_LAUNCH_SECRET, + SEV_GUEST_STATE_RUNNING, +}; + +#define SEV_POLICY_NO_DBG (1UL << 0) +#define SEV_POLICY_ES (1UL << 2) + +#define GHCB_MSR_TERM_REQ 0x100 + +void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); +void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); +void sev_vm_launch_finish(struct kvm_vm *vm); + +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, + struct kvm_vcpu **cpu); +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); + +kvm_static_assert(SEV_RET_SUCCESS == 0); + +/* + * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long" + * instead of a proper struct. The size of the parameter is embedded in the + * ioctl number, i.e. is ABI and thus immutable. Hack around the mess by + * creating an overlay to pass in an "unsigned long" without a cast (casting + * will make the compiler unhappy due to dereferencing an aliased pointer). 
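To show how the launch helpers declared above fit together, a rough sketch of an SEV smoke flow; KVM_X86_SEV_VM (from the KVM uapi) and the 512-byte measurement buffer are assumptions of the example.

static void sev_launch_example(void *guest_code)
{
	uint8_t measurement[512];
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;

	vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_VM, guest_code, &vcpu);

	/* Encrypt and measure guest memory, then transition to RUNNING. */
	vm_sev_launch(vm, SEV_POLICY_NO_DBG, measurement);

	vcpu_run(vcpu);
	kvm_vm_free(vm);
}

For SEV-ES the policy would additionally set SEV_POLICY_ES.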
+ */ +#define __vm_sev_ioctl(vm, cmd, arg) \ +({ \ + int r; \ + \ + union { \ + struct kvm_sev_cmd c; \ + unsigned long raw; \ + } sev_cmd = { .c = { \ + .id = (cmd), \ + .data = (uint64_t)(arg), \ + .sev_fd = (vm)->arch.sev_fd, \ + } }; \ + \ + r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &sev_cmd.raw); \ + r ?: sev_cmd.c.error; \ +}) + +#define vm_sev_ioctl(vm, cmd, arg) \ +({ \ + int ret = __vm_sev_ioctl(vm, cmd, arg); \ + \ + __TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd, ret, vm); \ +}) + +void sev_vm_init(struct kvm_vm *vm); +void sev_es_vm_init(struct kvm_vm *vm); + +static inline void sev_register_encrypted_memory(struct kvm_vm *vm, + struct userspace_mem_region *region) +{ + struct kvm_enc_region range = { + .addr = region->region.userspace_addr, + .size = region->region.memory_size, + }; + + vm_ioctl(vm, KVM_MEMORY_ENCRYPT_REG_REGION, &range); +} + +static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, + uint64_t size) +{ + struct kvm_sev_launch_update_data update_data = { + .uaddr = (unsigned long)addr_gpa2hva(vm, gpa), + .len = size, + }; + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data); +} + +#endif /* SELFTEST_KVM_SEV_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h index f4ea2355dbc2..4803e1056055 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -58,6 +58,27 @@ enum { INTERCEPT_RDPRU, }; +struct hv_vmcb_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS (1U << 31) + +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercept_cr; @@ -99,7 +120,17 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; + u8 reserved_7[8]; + u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. 
+ */ + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; @@ -211,8 +242,6 @@ struct __attribute__ ((__packed__)) vmcb { struct vmcb_save_area save; }; -#define SVM_CPUID_FUNC 0x8000000a - #define SVM_VM_CR_SVM_DISABLE 4 #define SVM_SELECTOR_S_SHIFT 4 diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index b7531c83b8ae..044f0f872ba9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -9,15 +9,12 @@ #ifndef SELFTEST_KVM_SVM_UTILS_H #define SELFTEST_KVM_SVM_UTILS_H +#include <asm/svm.h> + #include <stdint.h> #include "svm.h" #include "processor.h" -#define CPUID_SVM_BIT 2 -#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) - -#define SVM_EXIT_VMMCALL 0x081 - struct svm_test_data { /* VMCB */ struct vmcb *vmcb; /* gva */ @@ -28,22 +25,41 @@ struct svm_test_data { struct vmcb_save_area *save_area; /* gva */ void *save_area_hva; uint64_t save_area_gpa; + + /* MSR-Bitmap */ + void *msr; /* gva */ + void *msr_hva; + uint64_t msr_gpa; }; +static inline void vmmcall(void) +{ + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. + */ + __asm__ __volatile__("push %%rbp; vmmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +#define stgi() \ + __asm__ __volatile__( \ + "stgi\n" \ + ) + +#define clgi() \ + __asm__ __volatile__( \ + "clgi\n" \ + ) + struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); -bool nested_svm_supported(void); -void nested_svm_check_supported(void); - -static inline bool cpu_has_svm(void) -{ - u32 eax = 0x80000001, ecx; - - asm("cpuid" : - "=a" (eax), "=c" (ecx) : "0" (eax) : "ebx", "edx"); - return ecx & CPUID_SVM; -} +int open_sev_dev_path_or_exit(void); #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/ucall.h b/tools/testing/selftests/kvm/include/x86_64/ucall.h new file mode 100644 index 000000000000..d3825dcc3cd9 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/ucall.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_UCALL_H +#define SELFTEST_KVM_UCALL_H + +#include "kvm_util.h" + +#define UCALL_EXIT_REASON KVM_EXIT_IO + +static inline void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ +} + +#endif diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 16fa21ebb99c..5f0c0a29c556 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -8,12 +8,11 @@ #ifndef SELFTEST_KVM_VMX_H #define SELFTEST_KVM_VMX_H +#include <asm/vmx.h> + #include <stdint.h> #include "processor.h" - -#define CPUID_VMX_BIT 5 - -#define CPUID_VMX (1 << 5) +#include "apic.h" /* * Definitions of Primary Processor-Based VM-Execution Controls. 
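To make the intended use of vmmcall() above (and the analogous vmcall() change further below) concrete, a minimal L1/L2 sketch; the stack size is arbitrary and the host-side plumbing (vcpu_alloc_svm() and friends) is omitted.

static void l2_guest_code(void)
{
	/* ... L2 work ..., then deliberately exit to L1. */
	vmmcall();
}

static void l1_guest_code(struct svm_test_data *svm)
{
	unsigned long l2_stack[64];

	generic_svm_setup(svm, l2_guest_code, &l2_stack[64]);
	run_guest(svm->vmcb, svm->vmcb_gpa);

	/*
	 * vmmcall() stuffed RAX/RCX so that L0 doesn't treat the exit as a
	 * real hypercall; L1 simply sees a VMMCALL #VMEXIT.
	 */
	GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL);
	GUEST_DONE();
}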
@@ -48,7 +47,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_DESC 0x00000004 -#define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -99,56 +98,10 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 +#define VMX_EPT_VPID_CAP_1G_PAGES 0x00020000 +#define VMX_EPT_VPID_CAP_AD_BITS 0x00200000 + #define EXIT_REASON_FAILED_VMENTRY 0x80000000 -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 -#define EXIT_REASON_INTERRUPT_WINDOW 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVD 13 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_INVALID_STATE 33 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION 40 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EOI_INDUCED 45 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_INVEPT 50 -#define EXIT_REASON_RDTSCP 51 -#define EXIT_REASON_PREEMPTION_TIMER 52 -#define EXIT_REASON_INVVPID 53 -#define EXIT_REASON_WBINVD 54 -#define EXIT_REASON_XSETBV 55 -#define EXIT_REASON_APIC_WRITE 56 -#define EXIT_REASON_INVPCID 58 -#define EXIT_REASON_PML_FULL 62 -#define EXIT_REASON_XSAVES 63 -#define EXIT_REASON_XRSTORS 64 -#define LAST_EXIT_REASON 64 enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -208,6 +161,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, @@ -482,11 +437,16 @@ static inline int vmresume(void) static inline void vmcall(void) { - /* Currently, L1 destroys our GPRs during vmexits. */ - __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : : - "rax", "rbx", "rcx", "rdx", - "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", - "r13", "r14", "r15"); + /* + * Stuff RAX and RCX with "safe" values to make sure L0 doesn't handle + * it as a valid hypercall (e.g. Hyper-V L2 TLB flush) as the intended + * use of this function is to exit to L1 from L2. Clobber all other + * GPRs as L1 doesn't correctly preserve them during vmexits. 
+ */ + __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); } static inline int vmread(uint64_t encoding, uint64_t *value) @@ -562,17 +522,13 @@ struct vmx_pages { uint64_t vmwrite_gpa; void *vmwrite; - void *vp_assist_hva; - uint64_t vp_assist_gpa; - void *vp_assist; - - void *enlightened_vmcs_hva; - uint64_t enlightened_vmcs_gpa; - void *enlightened_vmcs; - void *eptp_hva; uint64_t eptp_gpa; void *eptp; + + void *apic_access_hva; + uint64_t apic_access_gpa; + void *apic_access; }; union vmx_basic { @@ -603,17 +559,19 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx); void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp); bool load_vmcs(struct vmx_pages *vmx); -bool nested_vmx_supported(void); -void nested_vmx_check_supported(void); +bool ept_1g_pages_supported(void); void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr); void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot); + uint64_t nested_paddr, uint64_t paddr, uint64_t size); void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot); + uint32_t memslot); +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size); +bool kvm_cpu_has_ept(void); void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot); +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c new file mode 100644 index 000000000000..f02355c3c4c2 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_binary_stats_test + * + * Copyright (C) 2021, Google LLC. + * + * Test the fd-based interface for KVM statistics. 
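For readers following the offset checks in stats_test() below, this is the uapi header layout the test parses, reproduced here from memory for reference only; <linux/kvm.h> is authoritative.

struct kvm_stats_header {
	__u32 flags;
	__u32 name_size;	/* size of the id string, including the NUL */
	__u32 num_desc;		/* number of struct kvm_stats_desc entries */
	__u32 id_offset;	/* file offset of the id string */
	__u32 desc_offset;	/* file offset of the descriptor block */
	__u32 data_offset;	/* file offset of the u64 data block */
};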
+ */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" +#include "kselftest.h" + +static void stats_test(int stats_fd) +{ + ssize_t ret; + int i; + size_t size_desc; + size_t size_data = 0; + struct kvm_stats_header header; + char *id; + struct kvm_stats_desc *stats_desc; + u64 *stats_data; + struct kvm_stats_desc *pdesc; + u32 type, unit, base; + + /* Read kvm stats header */ + read_stats_header(stats_fd, &header); + + size_desc = get_stats_descriptor_size(&header); + + /* Read kvm stats id string */ + id = malloc(header.name_size); + TEST_ASSERT(id, "Allocate memory for id string"); + + ret = pread(stats_fd, id, header.name_size, sizeof(header)); + TEST_ASSERT(ret == header.name_size, + "Expected header size '%u', read '%lu' bytes", + header.name_size, ret); + + /* Check id string, that should start with "kvm" */ + TEST_ASSERT(!strncmp(id, "kvm", 3) && strlen(id) < header.name_size, + "Invalid KVM stats type, id: %s", id); + + /* Sanity check for other fields in header */ + if (header.num_desc == 0) { + ksft_print_msg("No KVM stats defined!\n"); + return; + } + /* + * The descriptor and data offsets must be valid, they must not overlap + * the header, and the descriptor and data blocks must not overlap each + * other. Note, the data block is rechecked after its size is known. + */ + TEST_ASSERT(header.desc_offset && header.desc_offset >= sizeof(header) && + header.data_offset && header.data_offset >= sizeof(header), + "Invalid offset fields in header"); + + TEST_ASSERT(header.desc_offset > header.data_offset || + (header.desc_offset + size_desc * header.num_desc <= header.data_offset), + "Descriptor block is overlapped with data block"); + + /* Read kvm stats descriptors */ + stats_desc = read_stats_descriptors(stats_fd, &header); + + /* Sanity check for fields in descriptors */ + for (i = 0; i < header.num_desc; ++i) { + pdesc = get_stats_descriptor(stats_desc, i, &header); + type = pdesc->flags & KVM_STATS_TYPE_MASK; + unit = pdesc->flags & KVM_STATS_UNIT_MASK; + base = pdesc->flags & KVM_STATS_BASE_MASK; + + /* Check name string */ + TEST_ASSERT(strlen(pdesc->name) < header.name_size, + "KVM stats name (index: %d) too long", i); + + /* Check type,unit,base boundaries */ + TEST_ASSERT(type <= KVM_STATS_TYPE_MAX, + "Unknown KVM stats (%s) type: %u", pdesc->name, type); + TEST_ASSERT(unit <= KVM_STATS_UNIT_MAX, + "Unknown KVM stats (%s) unit: %u", pdesc->name, unit); + TEST_ASSERT(base <= KVM_STATS_BASE_MAX, + "Unknown KVM stats (%s) base: %u", pdesc->name, base); + + /* + * Check exponent for stats unit + * Exponent for counter should be greater than or equal to 0 + * Exponent for unit bytes should be greater than or equal to 0 + * Exponent for unit seconds should be less than or equal to 0 + * Exponent for unit clock cycles should be greater than or + * equal to 0 + * Exponent for unit boolean should be 0 + */ + switch (pdesc->flags & KVM_STATS_UNIT_MASK) { + case KVM_STATS_UNIT_NONE: + case KVM_STATS_UNIT_BYTES: + case KVM_STATS_UNIT_CYCLES: + TEST_ASSERT(pdesc->exponent >= 0, + "Unsupported KVM stats (%s) exponent: %i", + pdesc->name, pdesc->exponent); + break; + case KVM_STATS_UNIT_SECONDS: + TEST_ASSERT(pdesc->exponent <= 0, + "Unsupported KVM stats (%s) exponent: %i", + pdesc->name, pdesc->exponent); + break; + case KVM_STATS_UNIT_BOOLEAN: + TEST_ASSERT(pdesc->exponent == 0, + "Unsupported KVM stats (%s) exponent: %d", + pdesc->name, 
pdesc->exponent); + break; + } + + /* Check size field, which should not be zero */ + TEST_ASSERT(pdesc->size, + "KVM descriptor(%s) with size of 0", pdesc->name); + /* Check bucket_size field */ + switch (pdesc->flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_LINEAR_HIST: + TEST_ASSERT(pdesc->bucket_size, + "Bucket size of Linear Histogram stats (%s) is zero", + pdesc->name); + break; + default: + TEST_ASSERT(!pdesc->bucket_size, + "Bucket size of stats (%s) is not zero", + pdesc->name); + } + size_data = max(size_data, pdesc->offset + pdesc->size * sizeof(*stats_data)); + } + + /* + * Now that the size of the data block is known, verify the data block + * doesn't overlap the descriptor block. + */ + TEST_ASSERT(header.data_offset >= header.desc_offset || + header.data_offset + size_data <= header.desc_offset, + "Data block is overlapped with Descriptor block"); + + /* Check validity of all stats data size */ + TEST_ASSERT(size_data >= header.num_desc * sizeof(*stats_data), + "Data size is not correct"); + + /* Allocate memory for stats data */ + stats_data = malloc(size_data); + TEST_ASSERT(stats_data, "Allocate memory for stats data"); + /* Read kvm stats data as a bulk */ + ret = pread(stats_fd, stats_data, size_data, header.data_offset); + TEST_ASSERT(ret == size_data, "Read KVM stats data"); + /* Read kvm stats data one by one */ + for (i = 0; i < header.num_desc; ++i) { + pdesc = get_stats_descriptor(stats_desc, i, &header); + read_stat_data(stats_fd, &header, pdesc, stats_data, + pdesc->size); + } + + free(stats_data); + free(stats_desc); + free(id); + + close(stats_fd); + TEST_ASSERT(fcntl(stats_fd, F_GETFD) == -1, "Stats fd not freed"); +} + +#define DEFAULT_NUM_VM 4 +#define DEFAULT_NUM_VCPU 4 + +/* + * Usage: kvm_bin_form_stats [#vm] [#vcpu] + * The first parameter #vm set the number of VMs being created. + * The second parameter #vcpu set the number of VCPUs being created. + * By default, DEFAULT_NUM_VM VM and DEFAULT_NUM_VCPU VCPU for the VM would be + * created for testing. + */ + +int main(int argc, char *argv[]) +{ + int vm_stats_fds, *vcpu_stats_fds; + int i, j; + struct kvm_vcpu **vcpus; + struct kvm_vm **vms; + int max_vm = DEFAULT_NUM_VM; + int max_vcpu = DEFAULT_NUM_VCPU; + + /* Get the number of VMs and VCPUs that would be created for testing. */ + if (argc > 1) { + max_vm = strtol(argv[1], NULL, 0); + if (max_vm <= 0) + max_vm = DEFAULT_NUM_VM; + } + if (argc > 2) { + max_vcpu = strtol(argv[2], NULL, 0); + if (max_vcpu <= 0) + max_vcpu = DEFAULT_NUM_VCPU; + } + + ksft_print_header(); + + /* Check the extension for binary stats */ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_BINARY_STATS_FD)); + + ksft_set_plan(max_vm); + + /* Create VMs and VCPUs */ + vms = malloc(sizeof(vms[0]) * max_vm); + TEST_ASSERT(vms, "Allocate memory for storing VM pointers"); + + vcpus = malloc(sizeof(struct kvm_vcpu *) * max_vm * max_vcpu); + TEST_ASSERT(vcpus, "Allocate memory for storing vCPU pointers"); + + /* + * Not per-VM as the array is populated, used, and invalidated within a + * single for-loop iteration. + */ + vcpu_stats_fds = calloc(max_vm, sizeof(*vcpu_stats_fds)); + TEST_ASSERT(vcpu_stats_fds, "Allocate memory for VM stats fds"); + + for (i = 0; i < max_vm; ++i) { + vms[i] = vm_create_barebones(); + for (j = 0; j < max_vcpu; ++j) + vcpus[i * max_vcpu + j] = __vm_vcpu_add(vms[i], j); + } + + /* + * Check stats read for every VM and vCPU, with a variety of flavors. + * Note, stats_test() closes the passed in stats fd. 
+ */ + for (i = 0; i < max_vm; ++i) { + /* + * Verify that creating multiple userspace references to a + * single stats file works and doesn't cause explosions. + */ + vm_stats_fds = vm_get_stats_fd(vms[i]); + stats_test(dup(vm_stats_fds)); + + /* Verify userspace can instantiate multiple stats files. */ + stats_test(vm_get_stats_fd(vms[i])); + + for (j = 0; j < max_vcpu; ++j) { + vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); + stats_test(dup(vcpu_stats_fds[j])); + stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); + } + + /* + * Close the VM fd and redo the stats tests. KVM should gift a + * reference (to the VM) to each stats fd, i.e. stats should + * still be accessible even after userspace has put its last + * _direct_ reference to the VM. + */ + kvm_vm_free(vms[i]); + + stats_test(vm_stats_fds); + for (j = 0; j < max_vcpu; ++j) + stats_test(vcpu_stats_fds[j]); + + ksft_test_result_pass("vm%i\n", i); + } + + free(vms); + free(vcpus); + free(vcpu_stats_fds); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 0299cd81b8ba..c78f34699f73 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -6,12 +6,11 @@ * * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/resource.h> #include "test_util.h" @@ -27,11 +26,11 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus) pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n", num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + vm = vm_create_barebones(); for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++) /* This asserts that the vCPU was created. */ - vm_vcpu_add(vm, i); + __vm_vcpu_add(vm, i); kvm_vm_free(vm); } @@ -40,11 +39,38 @@ int main(int argc, char *argv[]) { int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + /* + * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds + + * an arbitrary number for everything else. + */ + int nr_fds_wanted = kvm_max_vcpus + 100; + struct rlimit rl; pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); /* + * Check that we're allowed to open nr_fds_wanted file descriptors and + * try raising the limits if needed. + */ + TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); + + if (rl.rlim_cur < nr_fds_wanted) { + rl.rlim_cur = nr_fds_wanted; + if (rl.rlim_max < nr_fds_wanted) { + int old_rlim_max = rl.rlim_max; + rl.rlim_max = nr_fds_wanted; + + int r = setrlimit(RLIMIT_NOFILE, &rl); + __TEST_REQUIRE(r >= 0, + "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)", + old_rlim_max, nr_fds_wanted); + } else { + TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); + } + } + + /* * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID * in this case. 
@@ -53,7 +79,7 @@ int main(int argc, char *argv[]) kvm_max_vcpu_id = kvm_max_vcpus; TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, - "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", + "KVM_MAX_VCPU_IDS (%d) must be at least as large as KVM_MAX_VCPUS (%d).", kvm_max_vcpu_id, kvm_max_vcpus); test_vcpu_creation(0, kvm_max_vcpus); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c new file mode 100644 index 000000000000..dd8b12f626d3 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM page table test + * + * Copyright (C) 2021, Huawei, Inc. + * + * Make sure that THP has been enabled or enough HUGETLB pages with specific + * page size have been pre-allocated on your system, if you are planning to + * use hugepages to back the guest memory for testing. + */ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <pthread.h> +#include <semaphore.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" +#include "ucall_common.h" + +#define TEST_MEM_SLOT_INDEX 1 + +/* Default size(1GB) of the memory for testing */ +#define DEFAULT_TEST_MEM_SIZE (1 << 30) + +/* Default guest test virtual memory offset */ +#define DEFAULT_GUEST_TEST_MEM 0xc0000000 + +/* Different guest memory accessing stages */ +enum test_stage { + KVM_BEFORE_MAPPINGS, + KVM_CREATE_MAPPINGS, + KVM_UPDATE_MAPPINGS, + KVM_ADJUST_MAPPINGS, + NUM_TEST_STAGES, +}; + +static const char * const test_stage_string[] = { + "KVM_BEFORE_MAPPINGS", + "KVM_CREATE_MAPPINGS", + "KVM_UPDATE_MAPPINGS", + "KVM_ADJUST_MAPPINGS", +}; + +struct test_args { + struct kvm_vm *vm; + uint64_t guest_test_virt_mem; + uint64_t host_page_size; + uint64_t host_num_pages; + uint64_t large_page_size; + uint64_t large_num_pages; + uint64_t host_pages_per_lpage; + enum vm_mem_backing_src_type src_type; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; +}; + +/* + * Guest variables. Use addr_gva2hva() if these variables need + * to be changed in host. + */ +static enum test_stage guest_test_stage; + +/* Host variables */ +static uint32_t nr_vcpus = 1; +static struct test_args test_args; +static enum test_stage *current_stage; +static bool host_quit; + +/* Whether the test stage is updated, or completed */ +static sem_t test_stage_updated; +static sem_t test_stage_completed; + +/* + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. + */ +static uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +static void guest_code(bool do_write) +{ + struct test_args *p = &test_args; + enum test_stage *current_stage = &guest_test_stage; + uint64_t addr; + int i, j; + + while (true) { + addr = p->guest_test_virt_mem; + + switch (READ_ONCE(*current_stage)) { + /* + * All vCPU threads will be started in this stage, + * where guest code of each vCPU will do nothing. + */ + case KVM_BEFORE_MAPPINGS: + break; + + /* + * Before dirty logging, vCPUs concurrently access the first + * 8 bytes of each page (host page/large page) within the same + * memory region with different accessing types (read/write). + * Then KVM will create normal page mappings or huge block + * mappings for them. 
+ */ + case KVM_CREATE_MAPPINGS: + for (i = 0; i < p->large_num_pages; i++) { + if (do_write) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + + addr += p->large_page_size; + } + break; + + /* + * During dirty logging, KVM will only update attributes of the + * normal page mappings from RO to RW if memory backing src type + * is anonymous. In other cases, KVM will split the huge block + * mappings into normal page mappings if memory backing src type + * is THP or HUGETLB. + */ + case KVM_UPDATE_MAPPINGS: + if (p->src_type == VM_MEM_SRC_ANONYMOUS) { + for (i = 0; i < p->host_num_pages; i++) { + *(uint64_t *)addr = 0x0123456789ABCDEF; + addr += p->host_page_size; + } + break; + } + + for (i = 0; i < p->large_num_pages; i++) { + /* + * Write to the first host page in each large + * page region, and triger break of large pages. + */ + *(uint64_t *)addr = 0x0123456789ABCDEF; + + /* + * Access the middle host pages in each large + * page region. Since dirty logging is enabled, + * this will create new mappings at the smallest + * granularity. + */ + addr += p->large_page_size / 2; + for (j = 0; j < p->host_pages_per_lpage / 2; j++) { + READ_ONCE(*(uint64_t *)addr); + addr += p->host_page_size; + } + } + break; + + /* + * After dirty logging is stopped, vCPUs concurrently read + * from every single host page. Then KVM will coalesce the + * split page mappings back to block mappings. And a TLB + * conflict abort could occur here if TLB entries of the + * page mappings are not fully invalidated. + */ + case KVM_ADJUST_MAPPINGS: + for (i = 0; i < p->host_num_pages; i++) { + READ_ONCE(*(uint64_t *)addr); + addr += p->host_page_size; + } + break; + + default: + GUEST_ASSERT(0); + } + + GUEST_SYNC(1); + } +} + +static void *vcpu_worker(void *data) +{ + struct kvm_vcpu *vcpu = data; + bool do_write = !(vcpu->id % 2); + struct timespec start; + struct timespec ts_diff; + enum test_stage stage; + int ret; + + vcpu_args_set(vcpu, 1, do_write); + + while (!READ_ONCE(host_quit)) { + ret = sem_wait(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_wait"); + + if (READ_ONCE(host_quit)) + return NULL; + + clock_gettime(CLOCK_MONOTONIC, &start); + ret = _vcpu_run(vcpu); + ts_diff = timespec_elapsed(start); + + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, + "Invalid guest sync status: exit_reason=%s", + exit_reason_str(vcpu->run->exit_reason)); + + pr_debug("Got sync event from vCPU %d\n", vcpu->id); + stage = READ_ONCE(*current_stage); + + /* + * Here we can know the execution time of every + * single vcpu running in different test stages. 
+ */ + pr_debug("vCPU %d has completed stage %s\n" + "execution time is: %ld.%.9lds\n\n", + vcpu->id, test_stage_string[stage], + ts_diff.tv_sec, ts_diff.tv_nsec); + + ret = sem_post(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + + return NULL; +} + +struct test_params { + uint64_t phys_offset; + uint64_t test_mem_size; + enum vm_mem_backing_src_type src_type; +}; + +static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) +{ + int ret; + struct test_params *p = arg; + enum vm_mem_backing_src_type src_type = p->src_type; + uint64_t large_page_size = get_backing_src_pagesz(src_type); + uint64_t guest_page_size = vm_guest_mode_params[mode].page_size; + uint64_t host_page_size = getpagesize(); + uint64_t test_mem_size = p->test_mem_size; + uint64_t guest_num_pages; + uint64_t alignment; + void *host_test_mem; + struct kvm_vm *vm; + + /* Align up the test memory size */ + alignment = max(large_page_size, guest_page_size); + test_mem_size = (test_mem_size + alignment - 1) & ~(alignment - 1); + + /* Create a VM with enough guest pages */ + guest_num_pages = test_mem_size / guest_page_size; + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, guest_num_pages, + guest_code, test_args.vcpus); + + /* Align down GPA of the testing memslot */ + if (!p->phys_offset) + guest_test_phys_mem = (vm->max_gfn - guest_num_pages) * + guest_page_size; + else + guest_test_phys_mem = p->phys_offset; +#ifdef __s390x__ + alignment = max(0x100000UL, alignment); +#endif + guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); + + /* Set up the shared data structure test_args */ + test_args.vm = vm; + test_args.guest_test_virt_mem = guest_test_virt_mem; + test_args.host_page_size = host_page_size; + test_args.host_num_pages = test_mem_size / host_page_size; + test_args.large_page_size = large_page_size; + test_args.large_num_pages = test_mem_size / large_page_size; + test_args.host_pages_per_lpage = large_page_size / host_page_size; + test_args.src_type = src_type; + + /* Add an extra memory slot with specified backing src type */ + vm_userspace_mem_region_add(vm, src_type, guest_test_phys_mem, + TEST_MEM_SLOT_INDEX, guest_num_pages, 0); + + /* Do mapping(GVA->GPA) for the testing memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); + + /* Cache the HVA pointer of the region */ + host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem); + + /* Export shared structure test_args to guest */ + sync_global_to_guest(vm, test_args); + + ret = sem_init(&test_stage_updated, 0, 0); + TEST_ASSERT(ret == 0, "Error in sem_init"); + + ret = sem_init(&test_stage_completed, 0, 0); + TEST_ASSERT(ret == 0, "Error in sem_init"); + + current_stage = addr_gva2hva(vm, (vm_vaddr_t)(&guest_test_stage)); + *current_stage = NUM_TEST_STAGES; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_info("Testing memory backing src type: %s\n", + vm_mem_backing_src_alias(src_type)->name); + pr_info("Testing memory backing src granularity: 0x%lx\n", + large_page_size); + pr_info("Testing memory size(aligned): 0x%lx\n", test_mem_size); + pr_info("Guest physical test memory offset: 0x%lx\n", + guest_test_phys_mem); + pr_info("Host virtual test memory offset: 0x%lx\n", + (uint64_t)host_test_mem); + pr_info("Number of testing vCPUs: %d\n", nr_vcpus); + + return vm; +} + +static void vcpus_complete_new_stage(enum test_stage stage) +{ + int ret; + int vcpus; + + /* Wake up all the vcpus to run new test stage */ + for (vcpus = 
0; vcpus < nr_vcpus; vcpus++) { + ret = sem_post(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + pr_debug("All vcpus have been notified to continue\n"); + + /* Wait for all the vcpus to complete new test stage */ + for (vcpus = 0; vcpus < nr_vcpus; vcpus++) { + ret = sem_wait(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_wait"); + + pr_debug("%d vcpus have completed stage %s\n", + vcpus + 1, test_stage_string[stage]); + } + + pr_debug("All vcpus have completed stage %s\n", + test_stage_string[stage]); +} + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + pthread_t *vcpu_threads; + struct kvm_vm *vm; + struct timespec start; + struct timespec ts_diff; + int ret, i; + + /* Create VM with vCPUs and make some pre-initialization */ + vm = pre_init_before_test(mode, arg); + + vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); + TEST_ASSERT(vcpu_threads, "Memory allocation failed"); + + host_quit = false; + *current_stage = KVM_BEFORE_MAPPINGS; + + for (i = 0; i < nr_vcpus; i++) + pthread_create(&vcpu_threads[i], NULL, vcpu_worker, + test_args.vcpus[i]); + + vcpus_complete_new_stage(*current_stage); + pr_info("Started all vCPUs successfully\n"); + + /* Test the stage of KVM creating mappings */ + *current_stage = KVM_CREATE_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_CREATE_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Test the stage of KVM updating mappings */ + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + KVM_MEM_LOG_DIRTY_PAGES); + + *current_stage = KVM_UPDATE_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_UPDATE_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Test the stage of KVM adjusting mappings */ + vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + + *current_stage = KVM_ADJUST_MAPPINGS; + + clock_gettime(CLOCK_MONOTONIC, &start); + vcpus_complete_new_stage(*current_stage); + ts_diff = timespec_elapsed(start); + + pr_info("KVM_ADJUST_MAPPINGS: total execution time: %ld.%.9lds\n\n", + ts_diff.tv_sec, ts_diff.tv_nsec); + + /* Tell the vcpu thread to quit */ + host_quit = true; + for (i = 0; i < nr_vcpus; i++) { + ret = sem_post(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_post"); + } + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i], NULL); + + ret = sem_destroy(&test_stage_updated); + TEST_ASSERT(ret == 0, "Error in sem_destroy"); + + ret = sem_destroy(&test_stage_completed); + TEST_ASSERT(ret == 0, "Error in sem_destroy"); + + free(vcpu_threads); + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p offset] [-m mode] " + "[-b mem-size] [-v vcpus] [-s mem-type]\n", name); + puts(""); + printf(" -p: specify guest physical test memory offset\n" + " Warning: a low offset can conflict with the loaded test code.\n"); + guest_modes_help(); + printf(" -b: specify size of the memory region for testing. e.g. 
10M or 3G.\n" + " (default: 1G)\n"); + printf(" -v: specify the number of vCPUs to run\n" + " (default: 1)\n"); + backing_src_help("-s"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + struct test_params p = { + .test_mem_size = DEFAULT_TEST_MEM_SIZE, + .src_type = DEFAULT_VM_MEM_SRC, + }; + int opt; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hp:m:b:v:s:")) != -1) { + switch (opt) { + case 'p': + p.phys_offset = strtoull(optarg, NULL, 0); + break; + case 'm': + guest_modes_cmdline(optarg); + break; + case 'b': + p.test_mem_size = parse_size(optarg); + break; + case 'v': + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); + break; + case 's': + p.src_type = parse_backing_src_type(optarg); + break; + case 'h': + default: + help(argv[0]); + exit(0); + } + } + + for_each_guest_mode(run_test, &p); + + return 0; +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c new file mode 100644 index 000000000000..7abbf8866512 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) support + */ + +#include <errno.h> +#include <linux/bits.h> +#include <linux/sizes.h> + +#include "kvm_util.h" + +#include <gic.h> +#include "gic_private.h" +#include "processor.h" +#include "spinlock.h" + +static const struct gic_common_ops *gic_common_ops; +static struct spinlock gic_lock; + +static void gic_cpu_init(unsigned int cpu) +{ + gic_common_ops->gic_cpu_init(cpu); +} + +static void gic_dist_init(enum gic_type type, unsigned int nr_cpus) +{ + const struct gic_common_ops *gic_ops = NULL; + + spin_lock(&gic_lock); + + /* Distributor initialization is needed only once per VM */ + if (gic_common_ops) { + spin_unlock(&gic_lock); + return; + } + + if (type == GIC_V3) + gic_ops = &gicv3_ops; + + GUEST_ASSERT(gic_ops); + + gic_ops->gic_init(nr_cpus); + gic_common_ops = gic_ops; + + /* Make sure that the initialized data is visible to all the vCPUs */ + dsb(sy); + + spin_unlock(&gic_lock); +} + +void gic_init(enum gic_type type, unsigned int nr_cpus) +{ + uint32_t cpu = guest_get_vcpuid(); + + GUEST_ASSERT(type < GIC_TYPE_MAX); + GUEST_ASSERT(nr_cpus); + + gic_dist_init(type, nr_cpus); + gic_cpu_init(cpu); +} + +void gic_irq_enable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_enable(intid); +} + +void gic_irq_disable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_disable(intid); +} + +unsigned int gic_get_and_ack_irq(void) +{ + uint64_t irqstat; + unsigned int intid; + + GUEST_ASSERT(gic_common_ops); + + irqstat = gic_common_ops->gic_read_iar(); + intid = irqstat & GENMASK(23, 0); + + return intid; +} + +void gic_set_eoi(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_eoir(intid); +} + +void gic_set_dir(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_dir(intid); +} + +void gic_set_eoi_split(bool split) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_eoi_split(split); +} + +void gic_set_priority_mask(uint64_t pmr) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_priority_mask(pmr); +} + +void gic_set_priority(unsigned int intid, unsigned int prio) +{ + GUEST_ASSERT(gic_common_ops); + 
gic_common_ops->gic_set_priority(intid, prio); +} + +void gic_irq_set_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_active(intid); +} + +void gic_irq_clear_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_active(intid); +} + +bool gic_irq_get_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_active(intid); +} + +void gic_irq_set_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_pending(intid); +} + +void gic_irq_clear_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_pending(intid); +} + +bool gic_irq_get_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_pending(intid); +} + +void gic_irq_set_config(unsigned int intid, bool is_edge) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_config(intid, is_edge); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h new file mode 100644 index 000000000000..d24e9ecc96c6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) private defines that's only + * shared among the GIC library code. + */ + +#ifndef SELFTEST_KVM_GIC_PRIVATE_H +#define SELFTEST_KVM_GIC_PRIVATE_H + +struct gic_common_ops { + void (*gic_init)(unsigned int nr_cpus); + void (*gic_cpu_init)(unsigned int cpu); + void (*gic_irq_enable)(unsigned int intid); + void (*gic_irq_disable)(unsigned int intid); + uint64_t (*gic_read_iar)(void); + void (*gic_write_eoir)(uint32_t irq); + void (*gic_write_dir)(uint32_t irq); + void (*gic_set_eoi_split)(bool split); + void (*gic_set_priority_mask)(uint64_t mask); + void (*gic_set_priority)(uint32_t intid, uint32_t prio); + void (*gic_irq_set_active)(uint32_t intid); + void (*gic_irq_clear_active)(uint32_t intid); + bool (*gic_irq_get_active)(uint32_t intid); + void (*gic_irq_set_pending)(uint32_t intid); + void (*gic_irq_clear_pending)(uint32_t intid); + bool (*gic_irq_get_pending)(uint32_t intid); + void (*gic_irq_set_config)(uint32_t intid, bool is_edge); +}; + +extern const struct gic_common_ops gicv3_ops; + +#endif /* SELFTEST_KVM_GIC_PRIVATE_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c new file mode 100644 index 000000000000..66d05506f78b --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 support + */ + +#include <linux/sizes.h> + +#include "kvm_util.h" +#include "processor.h" +#include "delay.h" + +#include "gic.h" +#include "gic_v3.h" +#include "gic_private.h" + +#define GICV3_MAX_CPUS 512 + +#define GICD_INT_DEF_PRI 0xa0 +#define GICD_INT_DEF_PRI_X4 ((GICD_INT_DEF_PRI << 24) |\ + (GICD_INT_DEF_PRI << 16) |\ + (GICD_INT_DEF_PRI << 8) |\ + GICD_INT_DEF_PRI) + +#define ICC_PMR_DEF_PRIO 0xf0 + +struct gicv3_data { + unsigned int nr_cpus; + unsigned int nr_spis; +}; + +#define sgi_base_from_redist(redist_base) (redist_base + SZ_64K) +#define DIST_BIT (1U << 31) + +enum gicv3_intid_range { + SGI_RANGE, + PPI_RANGE, + SPI_RANGE, + INVALID_RANGE, +}; + +static struct gicv3_data gicv3_data; + +static void gicv3_gicd_wait_for_rwp(void) +{ + unsigned int count = 100000; /* 1s */ 
+ + while (readl(GICD_BASE_GVA + GICD_CTLR) & GICD_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static inline volatile void *gicr_base_cpu(uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return GICR_BASE_GVA + cpu * SZ_64K * 2; +} + +static void gicv3_gicr_wait_for_rwp(uint32_t cpu) +{ + unsigned int count = 100000; /* 1s */ + + while (readl(gicr_base_cpu(cpu) + GICR_CTLR) & GICR_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) +{ + if (cpu_or_dist & DIST_BIT) + gicv3_gicd_wait_for_rwp(); + else + gicv3_gicr_wait_for_rwp(cpu_or_dist); +} + +static enum gicv3_intid_range get_intid_range(unsigned int intid) +{ + switch (intid) { + case 0 ... 15: + return SGI_RANGE; + case 16 ... 31: + return PPI_RANGE; + case 32 ... 1019: + return SPI_RANGE; + } + + /* We should not be reaching here */ + GUEST_ASSERT(0); + + return INVALID_RANGE; +} + +static uint64_t gicv3_read_iar(void) +{ + uint64_t irqstat = read_sysreg_s(SYS_ICC_IAR1_EL1); + + dsb(sy); + return irqstat; +} + +static void gicv3_write_eoir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_EOIR1_EL1); + isb(); +} + +static void gicv3_write_dir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_DIR_EL1); + isb(); +} + +static void gicv3_set_priority_mask(uint64_t mask) +{ + write_sysreg_s(mask, SYS_ICC_PMR_EL1); +} + +static void gicv3_set_eoi_split(bool split) +{ + uint32_t val; + + /* + * All other fields are read-only, so no need to read CTLR first. In + * fact, the kernel does the same. + */ + val = split ? (1U << 1) : 0; + write_sysreg_s(val, SYS_ICC_CTLR_EL1); + isb(); +} + +uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) +{ + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); + return readl(base + offset); +} + +void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) +{ + volatile void *base = cpu_or_dist & DIST_BIT ? GICD_BASE_GVA + : sgi_base_from_redist(gicr_base_cpu(cpu_or_dist)); + writel(reg_val, base + offset); +} + +uint32_t gicv3_getl_fields(uint32_t cpu_or_dist, uint64_t offset, uint32_t mask) +{ + return gicv3_reg_readl(cpu_or_dist, offset) & mask; +} + +void gicv3_setl_fields(uint32_t cpu_or_dist, uint64_t offset, + uint32_t mask, uint32_t reg_val) +{ + uint32_t tmp = gicv3_reg_readl(cpu_or_dist, offset) & ~mask; + + tmp |= (reg_val & mask); + gicv3_reg_writel(cpu_or_dist, offset, tmp); +} + +/* + * We use a single offset for the distributor and redistributor maps as they + * have the same value in both. The only exceptions are registers that only + * exist in one and not the other, like GICR_WAKER that doesn't exist in the + * distributor map. Such registers are conveniently marked as reserved in the + * map that doesn't implement it; like GICR_WAKER's offset of 0x0014 being + * marked as "Reserved" in the Distributor map. + */ +static void gicv3_access_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, + bool write, uint32_t *val) +{ + uint32_t cpu = guest_get_vcpuid(); + enum gicv3_intid_range intid_range = get_intid_range(intid); + uint32_t fields_per_reg, index, mask, shift; + uint32_t cpu_or_dist; + + GUEST_ASSERT(bits_per_field <= reg_bits); + GUEST_ASSERT(!write || *val < (1U << bits_per_field)); + /* + * This function does not support 64 bit accesses. Just asserting here + * until we implement readq/writeq. 
+ */ + GUEST_ASSERT(reg_bits == 32); + + fields_per_reg = reg_bits / bits_per_field; + index = intid % fields_per_reg; + shift = index * bits_per_field; + mask = ((1U << bits_per_field) - 1) << shift; + + /* Set offset to the actual register holding intid's config. */ + offset += (intid / fields_per_reg) * (reg_bits / 8); + + cpu_or_dist = (intid_range == SPI_RANGE) ? DIST_BIT : cpu; + + if (write) + gicv3_setl_fields(cpu_or_dist, offset, mask, *val << shift); + *val = gicv3_getl_fields(cpu_or_dist, offset, mask) >> shift; +} + +static void gicv3_write_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, uint32_t val) +{ + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, true, &val); +} + +static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field) +{ + uint32_t val; + + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, false, &val); + return val; +} + +static void gicv3_set_priority(uint32_t intid, uint32_t prio) +{ + gicv3_write_reg(intid, GICD_IPRIORITYR, 32, 8, prio); +} + +/* Sets the intid to be level-sensitive or edge-triggered. */ +static void gicv3_irq_set_config(uint32_t intid, bool is_edge) +{ + uint32_t val; + + /* N/A for private interrupts. */ + GUEST_ASSERT(get_intid_range(intid) == SPI_RANGE); + val = is_edge ? 2 : 0; + gicv3_write_reg(intid, GICD_ICFGR, 32, 2, val); +} + +static void gicv3_irq_enable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); +} + +static void gicv3_irq_disable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); +} + +static void gicv3_irq_set_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISACTIVER, 32, 1, 1); +} + +static void gicv3_irq_clear_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICACTIVER, 32, 1, 1); +} + +static bool gicv3_irq_get_active(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISACTIVER, 32, 1); +} + +static void gicv3_irq_set_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISPENDR, 32, 1, 1); +} + +static void gicv3_irq_clear_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICPENDR, 32, 1, 1); +} + +static bool gicv3_irq_get_pending(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); +} + +static void gicv3_enable_redist(volatile void *redist_base) +{ + uint32_t val = readl(redist_base + GICR_WAKER); + unsigned int count = 100000; /* 1s */ + + val &= ~GICR_WAKER_ProcessorSleep; + writel(val, redist_base + GICR_WAKER); + + /* Wait until the processor is 'active' */ + while (readl(redist_base + GICR_WAKER) & GICR_WAKER_ChildrenAsleep) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_cpu_init(unsigned int cpu) +{ + volatile void *sgi_base; + unsigned int i; + volatile void *redist_base_cpu; + + GUEST_ASSERT(cpu < gicv3_data.nr_cpus); + + redist_base_cpu = gicr_base_cpu(cpu); + sgi_base = sgi_base_from_redist(redist_base_cpu); + + gicv3_enable_redist(redist_base_cpu); + + /* + * Mark all the SGI and PPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. 
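gicv3_access_reg() above reduces every banked GIC register family (priority, enable, pending, config, ...) to the same arithmetic: how many fields fit in one 32-bit register, which register in the array owns this INTID, and where inside that register the field sits. The arithmetic in isolation, checked against the GICD_IPRIORITYR layout (8-bit fields) and the 1-bit enable arrays; this is a host-side sketch for illustration only, not part of the library:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same math as gicv3_access_reg(), pulled out for illustration only. */
static void field_location(uint32_t intid, uint32_t reg_bits,
			   uint32_t bits_per_field, uint64_t *byte_off,
			   uint32_t *shift, uint32_t *mask)
{
	uint32_t fields_per_reg = reg_bits / bits_per_field;

	*byte_off = (intid / fields_per_reg) * (reg_bits / 8);
	*shift = (intid % fields_per_reg) * bits_per_field;
	*mask = ((1U << bits_per_field) - 1) << *shift;
}

int main(void)
{
	uint64_t off;
	uint32_t shift, mask;

	/* Priority of INTID 42: 8-bit fields, four per 32-bit register. */
	field_location(42, 32, 8, &off, &shift, &mask);
	assert(off == 40 && shift == 16 && mask == 0x00ff0000);

	/* Enable bit of INTID 42: 1-bit fields, 32 per register. */
	field_location(42, 32, 1, &off, &shift, &mask);
	assert(off == 4 && shift == 10 && mask == 1U << 10);

	printf("all field-location checks passed\n");
	return 0;
}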
+ */ + writel(~0, sgi_base + GICR_IGROUPR0); + writel(~0, sgi_base + GICR_ICACTIVER0); + writel(~0, sgi_base + GICR_ICENABLER0); + + /* Set a default priority for all the SGIs and PPIs */ + for (i = 0; i < 32; i += 4) + writel(GICD_INT_DEF_PRI_X4, + sgi_base + GICR_IPRIORITYR0 + i); + + gicv3_gicr_wait_for_rwp(cpu); + + /* Enable the GIC system register (ICC_*) access */ + write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, + SYS_ICC_SRE_EL1); + + /* Set a default priority threshold */ + write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + + /* Enable non-secure Group-1 interrupts */ + write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); +} + +static void gicv3_dist_init(void) +{ + unsigned int i; + + /* Disable the distributor until we set things up */ + writel(0, GICD_BASE_GVA + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); + + /* + * Mark all the SPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. + */ + for (i = 32; i < gicv3_data.nr_spis; i += 32) { + writel(~0, GICD_BASE_GVA + GICD_IGROUPR + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICACTIVER + i / 8); + writel(~0, GICD_BASE_GVA + GICD_ICENABLER + i / 8); + } + + /* Set a default priority for all the SPIs */ + for (i = 32; i < gicv3_data.nr_spis; i += 4) + writel(GICD_INT_DEF_PRI_X4, + GICD_BASE_GVA + GICD_IPRIORITYR + i); + + /* Wait for the settings to sync-in */ + gicv3_gicd_wait_for_rwp(); + + /* Finally, enable the distributor globally with ARE */ + writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | + GICD_CTLR_ENABLE_G1, GICD_BASE_GVA + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); +} + +static void gicv3_init(unsigned int nr_cpus) +{ + GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); + + gicv3_data.nr_cpus = nr_cpus; + gicv3_data.nr_spis = GICD_TYPER_SPIS( + readl(GICD_BASE_GVA + GICD_TYPER)); + if (gicv3_data.nr_spis > 1020) + gicv3_data.nr_spis = 1020; + + /* + * Initialize only the distributor for now. + * The redistributor and CPU interfaces are initialized + * later for every PE. 
+ */ + gicv3_dist_init(); +} + +const struct gic_common_ops gicv3_ops = { + .gic_init = gicv3_init, + .gic_cpu_init = gicv3_cpu_init, + .gic_irq_enable = gicv3_irq_enable, + .gic_irq_disable = gicv3_irq_disable, + .gic_read_iar = gicv3_read_iar, + .gic_write_eoir = gicv3_write_eoir, + .gic_write_dir = gicv3_write_dir, + .gic_set_priority_mask = gicv3_set_priority_mask, + .gic_set_eoi_split = gicv3_set_eoi_split, + .gic_set_priority = gicv3_set_priority, + .gic_irq_set_active = gicv3_irq_set_active, + .gic_irq_clear_active = gicv3_irq_clear_active, + .gic_irq_get_active = gicv3_irq_get_active, + .gic_irq_set_pending = gicv3_irq_set_pending, + .gic_irq_clear_pending = gicv3_irq_clear_pending, + .gic_irq_get_pending = gicv3_irq_get_pending, + .gic_irq_set_config = gicv3_irq_set_config, +}; + +void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, + vm_paddr_t pend_table) +{ + volatile void *rdist_base = gicr_base_cpu(guest_get_vcpuid()); + + u32 ctlr; + u64 val; + + val = (cfg_table | + GICR_PROPBASER_InnerShareable | + GICR_PROPBASER_RaWaWb | + ((ilog2(cfg_table_size) - 1) & GICR_PROPBASER_IDBITS_MASK)); + writeq_relaxed(val, rdist_base + GICR_PROPBASER); + + val = (pend_table | + GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_RaWaWb); + writeq_relaxed(val, rdist_base + GICR_PENDBASER); + + ctlr = readl_relaxed(rdist_base + GICR_CTLR); + ctlr |= GICR_CTLR_ENABLE_LPIS; + writel_relaxed(ctlr, rdist_base + GICR_CTLR); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c new file mode 100644 index 000000000000..09f270545646 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3_its.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest ITS library, generously donated by drivers/irqchip/irq-gic-v3-its.c + * over in the kernel tree. 
+ */ + +#include <linux/kvm.h> +#include <linux/sizes.h> +#include <asm/kvm_para.h> +#include <asm/kvm.h> + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" +#include "processor.h" + +static u64 its_read_u64(unsigned long offset) +{ + return readq_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u64(unsigned long offset, u64 val) +{ + writeq_relaxed(val, GITS_BASE_GVA + offset); +} + +static u32 its_read_u32(unsigned long offset) +{ + return readl_relaxed(GITS_BASE_GVA + offset); +} + +static void its_write_u32(unsigned long offset, u32 val) +{ + writel_relaxed(val, GITS_BASE_GVA + offset); +} + +static unsigned long its_find_baser(unsigned int type) +{ + int i; + + for (i = 0; i < GITS_BASER_NR_REGS; i++) { + u64 baser; + unsigned long offset = GITS_BASER + (i * sizeof(baser)); + + baser = its_read_u64(offset); + if (GITS_BASER_TYPE(baser) == type) + return offset; + } + + GUEST_FAIL("Couldn't find an ITS BASER of type %u", type); + return -1; +} + +static void its_install_table(unsigned int type, vm_paddr_t base, size_t size) +{ + unsigned long offset = its_find_baser(type); + u64 baser; + + baser = ((size / SZ_64K) - 1) | + GITS_BASER_PAGE_SIZE_64K | + GITS_BASER_InnerShareable | + base | + GITS_BASER_RaWaWb | + GITS_BASER_VALID; + + its_write_u64(offset, baser); +} + +static void its_install_cmdq(vm_paddr_t base, size_t size) +{ + u64 cbaser; + + cbaser = ((size / SZ_4K) - 1) | + GITS_CBASER_InnerShareable | + base | + GITS_CBASER_RaWaWb | + GITS_CBASER_VALID; + + its_write_u64(GITS_CBASER, cbaser); +} + +void its_init(vm_paddr_t coll_tbl, size_t coll_tbl_sz, + vm_paddr_t device_tbl, size_t device_tbl_sz, + vm_paddr_t cmdq, size_t cmdq_size) +{ + u32 ctlr; + + its_install_table(GITS_BASER_TYPE_COLLECTION, coll_tbl, coll_tbl_sz); + its_install_table(GITS_BASER_TYPE_DEVICE, device_tbl, device_tbl_sz); + its_install_cmdq(cmdq, cmdq_size); + + ctlr = its_read_u32(GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + its_write_u32(GITS_CTLR, ctlr); +} + +struct its_cmd_block { + union { + u64 raw_cmd[4]; + __le64 raw_cmd_le[4]; + }; +}; + +static inline void its_fixup_cmd(struct its_cmd_block *cmd) +{ + /* Let's fixup BE commands */ + cmd->raw_cmd_le[0] = cpu_to_le64(cmd->raw_cmd[0]); + cmd->raw_cmd_le[1] = cpu_to_le64(cmd->raw_cmd[1]); + cmd->raw_cmd_le[2] = cpu_to_le64(cmd->raw_cmd[2]); + cmd->raw_cmd_le[3] = cpu_to_le64(cmd->raw_cmd[3]); +} + +static void its_mask_encode(u64 *raw_cmd, u64 val, int h, int l) +{ + u64 mask = GENMASK_ULL(h, l); + *raw_cmd &= ~mask; + *raw_cmd |= (val << l) & mask; +} + +static void its_encode_cmd(struct its_cmd_block *cmd, u8 cmd_nr) +{ + its_mask_encode(&cmd->raw_cmd[0], cmd_nr, 7, 0); +} + +static void its_encode_devid(struct its_cmd_block *cmd, u32 devid) +{ + its_mask_encode(&cmd->raw_cmd[0], devid, 63, 32); +} + +static void its_encode_event_id(struct its_cmd_block *cmd, u32 id) +{ + its_mask_encode(&cmd->raw_cmd[1], id, 31, 0); +} + +static void its_encode_phys_id(struct its_cmd_block *cmd, u32 phys_id) +{ + its_mask_encode(&cmd->raw_cmd[1], phys_id, 63, 32); +} + +static void its_encode_size(struct its_cmd_block *cmd, u8 size) +{ + its_mask_encode(&cmd->raw_cmd[1], size, 4, 0); +} + +static void its_encode_itt(struct its_cmd_block *cmd, u64 itt_addr) +{ + its_mask_encode(&cmd->raw_cmd[2], itt_addr >> 8, 51, 8); +} + +static void its_encode_valid(struct its_cmd_block *cmd, int valid) +{ + its_mask_encode(&cmd->raw_cmd[2], !!valid, 63, 63); +} + +static void its_encode_target(struct its_cmd_block *cmd, u64 target_addr) 
+{ + its_mask_encode(&cmd->raw_cmd[2], target_addr >> 16, 51, 16); +} + +static void its_encode_collection(struct its_cmd_block *cmd, u16 col) +{ + its_mask_encode(&cmd->raw_cmd[2], col, 15, 0); +} + +#define GITS_CMDQ_POLL_ITERATIONS 0 + +static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd) +{ + u64 cwriter = its_read_u64(GITS_CWRITER); + struct its_cmd_block *dst = cmdq_base + cwriter; + u64 cbaser = its_read_u64(GITS_CBASER); + size_t cmdq_size; + u64 next; + int i; + + cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K; + + its_fixup_cmd(cmd); + + WRITE_ONCE(*dst, *cmd); + dsb(ishst); + next = (cwriter + sizeof(*cmd)) % cmdq_size; + its_write_u64(GITS_CWRITER, next); + + /* + * Polling isn't necessary considering KVM's ITS emulation at the time + * of writing this, as the CMDQ is processed synchronously after a write + * to CWRITER. + */ + for (i = 0; its_read_u64(GITS_CREADR) != next; i++) { + __GUEST_ASSERT(i < GITS_CMDQ_POLL_ITERATIONS, + "ITS didn't process command at offset %lu after %d iterations\n", + cwriter, i); + + cpu_relax(); + } +} + +void its_send_mapd_cmd(void *cmdq_base, u32 device_id, vm_paddr_t itt_base, + size_t itt_size, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPD); + its_encode_devid(&cmd, device_id); + its_encode_size(&cmd, ilog2(itt_size) - 1); + its_encode_itt(&cmd, itt_base); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool valid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPC); + its_encode_collection(&cmd, collection_id); + its_encode_target(&cmd, vcpu_id); + its_encode_valid(&cmd, valid); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, + u32 collection_id, u32 intid) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_MAPTI); + its_encode_devid(&cmd, device_id); + its_encode_event_id(&cmd, event_id); + its_encode_phys_id(&cmd, intid); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} + +void its_send_invall_cmd(void *cmdq_base, u32 collection_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_INVALL); + its_encode_collection(&cmd, collection_id); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/handlers.S b/tools/testing/selftests/kvm/lib/aarch64/handlers.S new file mode 100644 index 000000000000..0e443eadfac6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/handlers.S @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +.macro save_registers + add sp, sp, #-16 * 17 + + stp x0, x1, [sp, #16 * 0] + stp x2, x3, [sp, #16 * 1] + stp x4, x5, [sp, #16 * 2] + stp x6, x7, [sp, #16 * 3] + stp x8, x9, [sp, #16 * 4] + stp x10, x11, [sp, #16 * 5] + stp x12, x13, [sp, #16 * 6] + stp x14, x15, [sp, #16 * 7] + stp x16, x17, [sp, #16 * 8] + stp x18, x19, [sp, #16 * 9] + stp x20, x21, [sp, #16 * 10] + stp x22, x23, [sp, #16 * 11] + stp x24, x25, [sp, #16 * 12] + stp x26, x27, [sp, #16 * 13] + stp x28, x29, [sp, #16 * 14] + + /* + * This stores sp_el1 into ex_regs.sp so exception handlers can "look" + * at it. It will _not_ be used to restore the sp on return from the + * exception so handlers can not update it. 
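its_send_cmd() above drives the ITS command queue the same way the kernel driver does: place the 32-byte command at the slot named by GITS_CWRITER, publish it with a barrier, then advance CWRITER modulo the queue size programmed into GITS_CBASER (whose low byte holds the number of 4KiB pages minus one). A host-side sketch of just the ring arithmetic, with made-up register values instead of real MMIO accesses:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_4K		4096
#define CMD_SIZE	32	/* sizeof(struct its_cmd_block) */

/* Mirror of the cmdq_size / next-slot computation in its_send_cmd(). */
static uint64_t next_cwriter(uint64_t cbaser, uint64_t cwriter)
{
	uint64_t cmdq_size = ((cbaser & 0xFF) + 1) * SZ_4K;

	return (cwriter + CMD_SIZE) % cmdq_size;
}

int main(void)
{
	/* Pretend CBASER was programmed for a one-page (4KiB) queue. */
	uint64_t cbaser = 0x0;			/* size field 0 -> 1 page */
	uint64_t cwriter = SZ_4K - CMD_SIZE;	/* last slot in the ring */

	/* Writing the last slot wraps CWRITER back to the start. */
	assert(next_cwriter(cbaser, cwriter) == 0);

	/* Anywhere else it just advances by one 32-byte command. */
	assert(next_cwriter(cbaser, 64) == 96);

	printf("command queue wrap-around behaves as expected\n");
	return 0;
}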
+ */ + add x1, sp, #16 * 17 + stp x30, x1, [sp, #16 * 15] /* x30, SP */ + + mrs x1, elr_el1 + mrs x2, spsr_el1 + stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ +.endm + +.macro restore_registers + ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ + msr elr_el1, x1 + msr spsr_el1, x2 + + /* sp is not restored */ + ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ + + ldp x28, x29, [sp, #16 * 14] + ldp x26, x27, [sp, #16 * 13] + ldp x24, x25, [sp, #16 * 12] + ldp x22, x23, [sp, #16 * 11] + ldp x20, x21, [sp, #16 * 10] + ldp x18, x19, [sp, #16 * 9] + ldp x16, x17, [sp, #16 * 8] + ldp x14, x15, [sp, #16 * 7] + ldp x12, x13, [sp, #16 * 6] + ldp x10, x11, [sp, #16 * 5] + ldp x8, x9, [sp, #16 * 4] + ldp x6, x7, [sp, #16 * 3] + ldp x4, x5, [sp, #16 * 2] + ldp x2, x3, [sp, #16 * 1] + ldp x0, x1, [sp, #16 * 0] + + add sp, sp, #16 * 17 + + eret +.endm + +.pushsection ".entry.text", "ax" +.balign 0x800 +.global vectors +vectors: +.popsection + +.set vector, 0 + +/* + * Build an exception handler for vector and append a jump to it into + * vectors (while making sure that it's 0x80 aligned). + */ +.macro HANDLER, label +handler_\label: + save_registers + mov x0, sp + mov x1, #vector + bl route_exception + restore_registers + +.pushsection ".entry.text", "ax" +.balign 0x80 + b handler_\label +.popsection + +.set vector, vector + 1 +.endm + +.macro HANDLER_INVALID +.pushsection ".entry.text", "ax" +.balign 0x80 +/* This will abort so no need to save and restore registers. */ + mov x0, #vector + mov x1, #0 /* ec */ + mov x2, #0 /* valid_ec */ + b kvm_exit_unexpected_exception +.popsection + +.set vector, vector + 1 +.endm + +/* + * Caution: be sure to not add anything between the declaration of vectors + * above and these macro calls that will build the vectors table below it. + */ + HANDLER_INVALID // Synchronous EL1t + HANDLER_INVALID // IRQ EL1t + HANDLER_INVALID // FIQ EL1t + HANDLER_INVALID // Error EL1t + + HANDLER el1h_sync // Synchronous EL1h + HANDLER el1h_irq // IRQ EL1h + HANDLER el1h_fiq // FIQ EL1h + HANDLER el1h_error // Error EL1h + + HANDLER el0_sync_64 // Synchronous 64-bit EL0 + HANDLER el0_irq_64 // IRQ 64-bit EL0 + HANDLER el0_fiq_64 // FIQ 64-bit EL0 + HANDLER el0_error_64 // Error 64-bit EL0 + + HANDLER el0_sync_32 // Synchronous 32-bit EL0 + HANDLER el0_irq_32 // IRQ 32-bit EL0 + HANDLER el0_fiq_32 // FIQ 32-bit EL0 + HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 2afa6618b396..0ac7cc89f38c 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -5,17 +5,21 @@ * Copyright (C) 2018, Red Hat, Inc. 
*/ -#define _GNU_SOURCE /* for program_invocation_name */ - #include <linux/compiler.h> +#include <assert.h> +#include "guest_modes.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" +#include "ucall_common.h" + +#include <linux/bitfield.h> +#include <linux/sizes.h> -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 +static vm_vaddr_t exception_handlers; + static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { return (v + vm->page_size) & ~(vm->page_size - 1); @@ -57,10 +61,44 @@ static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva) return (gva >> vm->page_shift) & mask; } -static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) +static inline bool use_lpa2_pte_format(struct kvm_vm *vm) { - uint64_t mask = ((1UL << (vm->va_bits - vm->page_shift)) - 1) << vm->page_shift; - return entry & mask; + return (vm->page_size == SZ_4K || vm->page_size == SZ_16K) && + (vm->pa_bits > 48 || vm->va_bits > 48); +} + +static uint64_t addr_pte(struct kvm_vm *vm, uint64_t pa, uint64_t attrs) +{ + uint64_t pte; + + if (use_lpa2_pte_format(vm)) { + pte = pa & GENMASK(49, vm->page_shift); + pte |= FIELD_GET(GENMASK(51, 50), pa) << 8; + attrs &= ~GENMASK(9, 8); + } else { + pte = pa & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pte |= FIELD_GET(GENMASK(51, 48), pa) << 12; + } + pte |= attrs; + + return pte; +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t pte) +{ + uint64_t pa; + + if (use_lpa2_pte_format(vm)) { + pa = pte & GENMASK(49, vm->page_shift); + pa |= FIELD_GET(GENMASK(9, 8), pte) << 50; + } else { + pa = pte & GENMASK(47, vm->page_shift); + if (vm->page_shift == 16) + pa |= FIELD_GET(GENMASK(15, 12), pte) << 48; + } + + return pa; } static uint64_t ptrs_per_pgd(struct kvm_vm *vm) @@ -74,19 +112,21 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) return 1 << (vm->page_shift - 3); } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { - if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_pages_alloc(vm, - page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - vm->pgd = paddr; - vm->pgd_created = true; - } + size_t nr_pages = page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size; + + if (vm->pgd_created) + return; + + vm->pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->pgd_created = true; } -void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot, uint64_t flags) +static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t flags) { uint8_t attr_idx = flags & 7; uint64_t *ptep; @@ -106,25 +146,19 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, paddr, vm->max_gfn, vm->page_size); ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; - if (!*ptep) { - 
*ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = addr_pte(vm, vm_alloc_page_table(vm), 3); /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; @@ -133,19 +167,17 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, TEST_FAIL("Page table levels must be 2, 3, or 4"); } - *ptep = paddr | 3; - *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; + *ptep = addr_pte(vm, paddr, (attr_idx << 2) | (1 << 10) | 3); /* AF */ } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { - uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ + uint64_t attr_idx = MT_NORMAL; - _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); + _virt_pg_map(vm, vaddr, paddr, attr_idx); } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; @@ -176,11 +208,18 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) TEST_FAIL("Page table levels must be 2, 3, or 4"); } - return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + return ptep; unmapped_gva: TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); - exit(1); + exit(EXIT_FAILURE); +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep = virt_get_pte_hva(vm, gva); + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); } static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) @@ -202,7 +241,7 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p #endif } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level = 4 - (vm->pgtable_levels - 1); uint64_t pgd, *ptep; @@ -219,25 +258,11 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) -{ - uint64_t ptrs_per_4k_pte = 512; - uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2; - struct kvm_vm *vm; - - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); - vm_vcpu_add_default(vm, vcpuid, guest_code); - - return vm; -} - -void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init) +void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { struct kvm_vcpu_init default_init = { .target = -1, }; - uint64_t sctlr_el1, tcr_el1; + struct kvm_vm *vm = vcpu->vm; + uint64_t sctlr_el1, tcr_el1, ttbr0_el1; if (!init) init = &default_init; @@ -248,44 +273,71 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini init->target = preferred.target; } - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init); + vcpu_ioctl(vcpu, KVM_ARM_VCPU_INIT, init); /* * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 * registers, which the variable argument list macros do. 
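Under LPA2 (4K or 16K granules with more than 48 bits of PA or VA), the descriptor layout changes: bits 51:50 of the physical address no longer fit in the output-address field and are carried in descriptor bits 9:8 instead, which is exactly what addr_pte() and pte_addr() above encode and decode. A stand-alone round-trip check of that packing, using the same masks as the helpers and assuming a 4K granule (page shift of 12) for the example:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define PAGE_SHIFT	12	/* 4K granule for this example */

/* LPA2 encode: PA[49:12] stays in place, PA[51:50] moves to desc[9:8]. */
static uint64_t pa_to_desc(uint64_t pa)
{
	uint64_t desc = pa & GENMASK_ULL(49, PAGE_SHIFT);

	desc |= ((pa & GENMASK_ULL(51, 50)) >> 50) << 8;
	return desc;
}

/* LPA2 decode: the inverse of pa_to_desc(). */
static uint64_t desc_to_pa(uint64_t desc)
{
	uint64_t pa = desc & GENMASK_ULL(49, PAGE_SHIFT);

	pa |= ((desc & GENMASK_ULL(9, 8)) >> 8) << 50;
	return pa;
}

int main(void)
{
	/* A page-aligned PA that actually needs bits 51:50. */
	uint64_t pa = (3ULL << 50) | 0x1234000;
	uint64_t desc = pa_to_desc(pa);

	assert((desc & GENMASK_ULL(9, 8)) == (3ULL << 8));
	assert(desc_to_pa(desc) == pa);
	printf("LPA2 output-address packing round-trips\n");
	return 0;
}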
*/ - set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1); - get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); + /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_P52V48_4K: - TEST_FAIL("AArch64 does not support 4K sized pages " - "with 52-bit physical address ranges"); case VM_MODE_PXXV48_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: + case VM_MODE_P48V48_64K: + case VM_MODE_P40V48_64K: + case VM_MODE_P36V48_64K: tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ - tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ break; + case VM_MODE_P52V48_16K: + case VM_MODE_P48V48_16K: + case VM_MODE_P40V48_16K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ + break; + case VM_MODE_P52V48_4K: case VM_MODE_P48V48_4K: + case VM_MODE_P40V48_4K: + case VM_MODE_P36V48_4K: tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ - tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + ttbr0_el1 = vm->pgd & GENMASK(47, vm->page_shift); + + /* Configure output size */ + switch (vm->mode) { + case VM_MODE_P52V48_4K: + case VM_MODE_P52V48_16K: + case VM_MODE_P52V48_64K: + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + ttbr0_el1 |= FIELD_GET(GENMASK(51, 48), vm->pgd) << 2; + break; + case VM_MODE_P48V48_4K: + case VM_MODE_P48V48_16K: case VM_MODE_P48V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ break; case VM_MODE_P40V48_4K: - tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ - tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ - break; + case VM_MODE_P40V48_16K: case VM_MODE_P40V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ break; + case VM_MODE_P36V48_4K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V48_64K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 1ul << 32; /* IPS = 36 bits */ + break; default: TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } @@ -294,59 +346,296 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini /* TCR_EL1 |= IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */; tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; - - set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1); - set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1); - set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1); - set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd); + if (use_lpa2_pte_format(vm)) + tcr_el1 |= (1ul << 59) /* DS */; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), ttbr0_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpu->id); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { uint64_t pstate, pc; - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate), &pstate); + vcpu_get_reg(vcpu, 
ARM64_CORE_REG(regs.pc), &pc); fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", indent, "", pstate, pc); } -void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_init *init, void *guest_code) +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); +} + +static struct kvm_vcpu *__aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init) { - size_t stack_size = vm->page_size == 4096 ? - DEFAULT_STACK_PGS * vm->page_size : - vm->page_size; - uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + size_t stack_size; + uint64_t stack_vaddr; + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + + stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); - vm_vcpu_add(vm, vcpuid); - aarch64_vcpu_setup(vm, vcpuid, init); + aarch64_vcpu_setup(vcpu, init); - set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + return vcpu; } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code) { - aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); + struct kvm_vcpu *vcpu = __aarch64_vcpu_add(vm, vcpu_id, init); + + vcpu_arch_set_entry_point(vcpu, guest_code); + + return vcpu; } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +{ + return __aarch64_vcpu_add(vm, vcpu_id, NULL); +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
{ va_list ap; int i; TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" - " num: %u\n", num); + " num: %u", num); va_start(ap, num); for (i = 0; i < num; i++) { - set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[i]), - va_arg(ap, uint64_t)); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); } va_end(ap); } + +void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) +{ + ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); + while (1) + ; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) != UCALL_UNHANDLED) + return; + + if (uc.args[2]) /* valid_ec */ { + assert(VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } else { + assert(!VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx)", + uc.args[0]); + } +} + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; +}; + +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +{ + extern char vectors; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); +} + +void route_exception(struct ex_regs *regs, int vector) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + bool valid_ec; + int ec = 0; + + switch (vector) { + case VECTOR_SYNC_CURRENT: + case VECTOR_SYNC_LOWER_64: + ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK; + valid_ec = true; + break; + case VECTOR_IRQ_CURRENT: + case VECTOR_IRQ_LOWER_64: + case VECTOR_FIQ_CURRENT: + case VECTOR_FIQ_LOWER_64: + case VECTOR_ERROR_CURRENT: + case VECTOR_ERROR_LOWER_64: + ec = 0; + valid_ec = false; + break; + default: + valid_ec = false; + goto unexpected_exception; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + kvm_exit_unexpected_exception(vector, ec, valid_ec); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size, MEM_REGION_DATA); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + assert(ec < ESR_EC_NUM); + handlers->exception_handlers[vector][ec] = handler; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(!VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector][0] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return read_sysreg(tpidr_el1); +} + +static uint32_t max_ipa_for_page_size(uint32_t vm_ipa, uint32_t gran, + uint32_t not_sup_val, uint32_t ipa52_min_val) +{ + if (gran == not_sup_val) + return 0; + else if (gran >= ipa52_min_val && vm_ipa >= 52) + return 52; + else + return min(vm_ipa, 48U); +} + +void aarch64_get_supported_page_sizes(uint32_t ipa, uint32_t *ipa4k, + uint32_t *ipa16k, uint32_t *ipa64k) +{ + struct kvm_vcpu_init preferred_init; + int kvm_fd, vm_fd, vcpu_fd, err; + uint64_t val; + uint32_t gran; + struct kvm_one_reg reg = { + .id = KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1), + .addr = (uint64_t)&val, + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = 
__kvm_ioctl(kvm_fd, KVM_CREATE_VM, (void *)(unsigned long)ipa); + TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); + + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); + + err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_PREFERRED_TARGET, err)); + err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_VCPU_INIT, err)); + + err = ioctl(vcpu_fd, KVM_GET_ONE_REG, ®); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN4), val); + *ipa4k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN4_NI, + ID_AA64MMFR0_EL1_TGRAN4_52_BIT); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN64), val); + *ipa64k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN64_NI, + ID_AA64MMFR0_EL1_TGRAN64_IMP); + + gran = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_TGRAN16), val); + *ipa16k = max_ipa_for_page_size(ipa, gran, ID_AA64MMFR0_EL1_TGRAN16_NI, + ID_AA64MMFR0_EL1_TGRAN16_52_BIT); + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +#define __smccc_call(insn, function_id, arg0, arg1, arg2, arg3, arg4, arg5, \ + arg6, res) \ + asm volatile("mov w0, %w[function_id]\n" \ + "mov x1, %[arg0]\n" \ + "mov x2, %[arg1]\n" \ + "mov x3, %[arg2]\n" \ + "mov x4, %[arg3]\n" \ + "mov x5, %[arg4]\n" \ + "mov x6, %[arg5]\n" \ + "mov x7, %[arg6]\n" \ + #insn "#0\n" \ + "mov %[res0], x0\n" \ + "mov %[res1], x1\n" \ + "mov %[res2], x2\n" \ + "mov %[res3], x3\n" \ + : [res0] "=r"(res->a0), [res1] "=r"(res->a1), \ + [res2] "=r"(res->a2), [res3] "=r"(res->a3) \ + : [function_id] "r"(function_id), [arg0] "r"(arg0), \ + [arg1] "r"(arg1), [arg2] "r"(arg2), [arg3] "r"(arg3), \ + [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6) \ + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7") + + +void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res) +{ + __smccc_call(hvc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, res); +} + +void smccc_smc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res) +{ + __smccc_call(smc, function_id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, res); +} + +void kvm_selftest_arch_init(void) +{ + /* + * arm64 doesn't have a true default mode, so start by computing the + * available IPA space and page sizes early. + */ + guest_modes_append_default(); +} + +void vm_vaddr_populate_bitmap(struct kvm_vm *vm) +{ + /* + * arm64 selftests use only TTBR0_EL1, meaning that the valid VA space + * is [0, 2^(64 - TCR_EL1.T0SZ)). 
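The __smccc_call() macro above hard-codes the SMC Calling Convention register assignment: the 32-bit function ID goes in W0, up to seven arguments in X1-X7, and the conduit (HVC or SMC) hands its results back in X0-X3, which land in struct arm_smccc_res. A minimal guest-side usage sketch; the PSCI_VERSION function ID (0x84000000) and the exact header that declares GUEST_ASSERT are assumptions here, not taken from this diff:

#include "processor.h"		/* smccc_hvc(), struct arm_smccc_res */
#include "ucall_common.h"	/* GUEST_ASSERT(), assumed location */

#define PSCI_VERSION_FUNC_ID	0x84000000	/* PSCI_VERSION, SMCCC32 fast call */

void guest_query_psci_version(void)
{
	struct arm_smccc_res res;

	/* Function ID in W0, no arguments, result comes back in X0 (res.a0). */
	smccc_hvc(PSCI_VERSION_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res);

	/* PSCI_VERSION encodes major[31:16].minor[15:0]; expect >= 0.2. */
	GUEST_ASSERT(res.a0 >= ((0UL << 16) | 2));
}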
+ */ + sparsebit_set_num(vm->vpages_valid, 0, + (1ULL << vm->va_bits) >> vm->page_shift); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/spinlock.c b/tools/testing/selftests/kvm/lib/aarch64/spinlock.c new file mode 100644 index 000000000000..a076e780be5d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/spinlock.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 Spinlock support + */ +#include <stdint.h> + +#include "spinlock.h" + +void spin_lock(struct spinlock *lock) +{ + int val, res; + + asm volatile( + "1: ldaxr %w0, [%2]\n" + " cbnz %w0, 1b\n" + " mov %w0, #1\n" + " stxr %w1, %w0, [%2]\n" + " cbnz %w1, 1b\n" + : "=&r" (val), "=&r" (res) + : "r" (&lock->v) + : "memory"); +} + +void spin_unlock(struct spinlock *lock) +{ + asm volatile("stlr wzr, [%0]\n" : : "r" (&lock->v) : "memory"); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index c8e0ec20d3bf..ddab0ce89d4d 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -5,108 +5,30 @@ * Copyright (C) 2018, Red Hat, Inc. */ #include "kvm_util.h" -#include "../kvm_util_internal.h" -static vm_vaddr_t *ucall_exit_mmio_addr; +vm_vaddr_t *ucall_exit_mmio_addr; -static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) +void ucall_arch_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) { - if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) - return false; + vm_vaddr_t mmio_gva = vm_vaddr_unused_gap(vm, vm->page_size, KVM_UTIL_MIN_VADDR); - virt_pg_map(vm, gpa, gpa, 0); + virt_map(vm, mmio_gva, mmio_gpa, 1); - ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; - sync_global_to_guest(vm, ucall_exit_mmio_addr); + vm->ucall_mmio_addr = mmio_gpa; - return true; + write_guest_global(vm, ucall_exit_mmio_addr, (vm_vaddr_t *)mmio_gva); } -void ucall_init(struct kvm_vm *vm, void *arg) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { - vm_paddr_t gpa, start, end, step, offset; - unsigned int bits; - bool ret; - - if (arg) { - gpa = (vm_paddr_t)arg; - ret = ucall_mmio_init(vm, gpa); - TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa); - return; - } - - /* - * Find an address within the allowed physical and virtual address - * spaces, that does _not_ have a KVM memory region associated with - * it. Identity mapping an address like this allows the guest to - * access it, but as KVM doesn't know what to do with it, it - * will assume it's something userspace handles and exit with - * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64. - * Here we start with a guess that the addresses around 5/8th - * of the allowed space are unmapped and then work both down and - * up from there in 1/16th allowed space sized steps. - * - * Note, we need to use VA-bits - 1 when calculating the allowed - * virtual address space for an identity mapping because the upper - * half of the virtual address space is the two's complement of the - * lower and won't match physical addresses. - */ - bits = vm->va_bits - 1; - bits = vm->pa_bits < bits ? 
vm->pa_bits : bits; - end = 1ul << bits; - start = end * 5 / 8; - step = end / 16; - for (offset = 0; offset < end - start; offset += step) { - if (ucall_mmio_init(vm, start - offset)) - return; - if (ucall_mmio_init(vm, start + offset)) - return; - } - TEST_FAIL("Can't find a ucall mmio address"); -} - -void ucall_uninit(struct kvm_vm *vm) -{ - ucall_exit_mmio_addr = 0; - sync_global_to_guest(vm, ucall_exit_mmio_addr); -} - -void ucall(uint64_t cmd, int nargs, ...) -{ - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - - *ucall_exit_mmio_addr = (vm_vaddr_t)&uc; -} - -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) -{ - struct kvm_run *run = vcpu_state(vm, vcpu_id); - struct ucall ucall = {}; + struct kvm_run *run = vcpu->run; if (run->exit_reason == KVM_EXIT_MMIO && - run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { - vm_vaddr_t gva; - - TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, + run->mmio.phys_addr == vcpu->vm->ucall_mmio_addr) { + TEST_ASSERT(run->mmio.is_write && run->mmio.len == sizeof(uint64_t), "Unexpected ucall exit mmio address access"); - memcpy(&gva, run->mmio.data, sizeof(gva)); - memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); - - vcpu_run_complete_io(vm, vcpu_id); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + return (void *)(*((uint64_t *)run->mmio.data)); } - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c new file mode 100644 index 000000000000..4427f43f73ea --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 host support + */ + +#include <linux/kernel.h> +#include <linux/kvm.h> +#include <linux/sizes.h> +#include <asm/cputype.h> +#include <asm/kvm_para.h> +#include <asm/kvm.h> + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" + +/* + * vGIC-v3 default host setup + * + * Input args: + * vm - KVM VM + * nr_vcpus - Number of vCPUs supported by this VM + * + * Output args: None + * + * Return: GIC file-descriptor or negative error code upon failure + * + * The function creates a vGIC-v3 device and maps the distributor and + * redistributor regions of the guest. Since it depends on the number of + * vCPUs for the VM, it must be called after all the vCPUs have been created. + */ +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs) +{ + int gic_fd; + uint64_t attr; + struct list_head *iter; + unsigned int nr_gic_pages, nr_vcpus_created = 0; + + TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty"); + + /* + * Make sure that the caller is infact calling this + * function after all the vCPUs are added. 
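On arm64 a ucall is just a 64-bit store of the address of a guest ucall structure to an MMIO page that has a stage-2 mapping but no memslot behind it, so the write exits to userspace as KVM_EXIT_MMIO; ucall_arch_get_ucall() above recognises that exit by its physical address and returns the guest pointer carried in mmio.data. Tests consume this through the generic get_ucall() helper; a typical host-side run loop looks roughly like the sketch below, where run_vcpu_until_done() is a hypothetical helper and the UCALL_* handling follows the common selftest pattern:

#include "kvm_util.h"
#include "ucall_common.h"

/* Hypothetical helper: drain ucalls from one vCPU until the guest is done. */
static void run_vcpu_until_done(struct kvm_vcpu *vcpu)
{
	struct ucall uc;

	for (;;) {
		vcpu_run(vcpu);

		switch (get_ucall(vcpu, &uc)) {
		case UCALL_SYNC:
			/* Guest checkpoint; uc.args[] carries its payload. */
			break;
		case UCALL_ABORT:
			REPORT_GUEST_ASSERT(uc);
			/* NOT REACHED */
		case UCALL_DONE:
			return;
		default:
			TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
		}
	}
}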
+ */ + list_for_each(iter, &vm->vcpus) + nr_vcpus_created++; + TEST_ASSERT(nr_vcpus == nr_vcpus_created, + "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)", + nr_vcpus, nr_vcpus_created); + + /* Distributor setup */ + gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (gic_fd < 0) + return gic_fd; + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + attr = GICD_BASE_GPA; + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &attr); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); + virt_map(vm, GICD_BASE_GPA, GICD_BASE_GPA, nr_gic_pages); + + /* Redistributor setup */ + attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, GICR_BASE_GPA, 0, 0); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &attr); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, + KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); + virt_map(vm, GICR_BASE_GPA, GICR_BASE_GPA, nr_gic_pages); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + return gic_fd; +} + +/* should only work for level sensitive interrupts */ +int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + uint64_t attr = 32 * (intid / 32); + uint64_t index = intid % 32; + uint64_t val; + int ret; + + ret = __kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + if (ret != 0) + return ret; + + val |= 1U << index; + ret = __kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + return ret; +} + +void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + int ret = _kvm_irq_set_level_info(gic_fd, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, ret)); +} + +int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK; + + TEST_ASSERT(!INTID_IS_SGI(intid), "KVM_IRQ_LINE's interface itself " + "doesn't allow injecting SGIs. There's no mask for it."); + + if (INTID_IS_PPI(intid)) + irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; + else + irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT; + + return _kvm_irq_line(vm, irq, level); +} + +void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + int ret = _kvm_arm_irq_line(vm, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); +} + +static void vgic_poke_irq(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu, + uint64_t reg_off) +{ + uint64_t reg = intid / 32; + uint64_t index = intid % 32; + uint64_t attr = reg_off + reg * 4; + uint64_t val; + bool intid_is_private = INTID_IS_SGI(intid) || INTID_IS_PPI(intid); + + uint32_t group = intid_is_private ? KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + : KVM_DEV_ARM_VGIC_GRP_DIST_REGS; + + if (intid_is_private) { + /* TODO: only vcpu 0 implemented for now. */ + assert(vcpu->id == 0); + attr += SZ_64K; + } + + /* Check that the addr part of the attr is within 32 bits. */ + assert((attr & ~KVM_DEV_ARM_VGIC_OFFSET_MASK) == 0); + + /* + * All calls will succeed, even with invalid intid's, as long as the + * addr part of the attr is within 32 bits (checked above). An invalid + * intid will just make the read/writes point to above the intended + * register space (i.e., ICPENDR after ISPENDR). 
+ */ + kvm_device_attr_get(gic_fd, group, attr, &val); + val |= 1ULL << index; + kvm_device_attr_set(gic_fd, group, attr, &val); +} + +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR); +} + +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); +} + +int vgic_its_setup(struct kvm_vm *vm) +{ + int its_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_ITS); + u64 attr; + + attr = GITS_BASE_GPA; + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_ITS_ADDR_TYPE, &attr); + + kvm_device_attr_set(its_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + virt_map(vm, GITS_BASE_GPA, GITS_BASE_GPA, + vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_ITS_SIZE)); + + return its_fd; +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index 5ebbd0d6b472..b49690658c60 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Google LLC. */ - -#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/ - #include "test_util.h" #include <execinfo.h> @@ -22,7 +19,7 @@ static void test_dump_stack(void) * Build and run this command: * * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \ - * grep -v test_dump_stack | cat -n 1>&2 + * cat -n 1>&2 * * Note that the spacing is different and there's no newline. */ @@ -36,18 +33,24 @@ static void test_dump_stack(void) n * (((sizeof(void *)) * 2) + 1) + /* Null terminator: */ 1]; - char *c; + char *c = cmd; n = backtrace(stack, n); - c = &cmd[0]; - c += sprintf(c, "%s", addr2line); /* - * Skip the first 3 frames: backtrace, test_dump_stack, and - * test_assert. We hope that backtrace isn't inlined and the other two - * we've declared noinline. + * Skip the first 2 frames, which should be test_dump_stack() and + * test_assert(); both of which are declared noinline. Bail if the + * resulting stack trace would be empty. Otherwise, addr2line will block + * waiting for addresses to be passed in via stdin. 
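vgic_poke_irq() above reaches the distributor and redistributor through the KVM device-attr interface, so the "attr" it builds is simply the byte offset of the 32-bit register that owns the INTID (reg_off + (intid / 32) * 4), with the INTID's bit selected separately and the 64KiB SGI frame added on top for private interrupts. The arithmetic in isolation, using GICD_ISPENDR's architectural 0x0200 offset as the example; this is an illustrative sketch, not selftest code:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GICD_ISPENDR	0x0200	/* architectural offset of the set-pending array */
#define SZ_64K		0x10000

/* Mirrors the attr/index computation in vgic_poke_irq(). */
static uint64_t poke_attr(uint32_t intid, uint64_t reg_off, bool is_private,
			  uint64_t *bit)
{
	uint64_t attr = reg_off + (intid / 32) * 4;

	*bit = intid % 32;
	if (is_private)
		attr += SZ_64K;	/* private INTIDs live in the SGI frame */
	return attr;
}

int main(void)
{
	uint64_t bit;

	/* SPI 70: third ISPENDR register in the distributor, bit 6. */
	assert(poke_attr(70, GICD_ISPENDR, false, &bit) == 0x208 && bit == 6);

	/* PPI 27 (private): redistributor SGI frame + ISPENDR0, bit 27. */
	assert(poke_attr(27, GICD_ISPENDR, true, &bit) == SZ_64K + 0x200 && bit == 27);

	printf("vgic poke offsets check out\n");
	return 0;
}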
*/ + if (n <= 2) { + fputs(" (stack trace empty)\n", stderr); + return; + } + + c += sprintf(c, "%s", addr2line); for (i = 2; i < n; i++) c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1); + c += sprintf(c, "%s", pipeline); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-result" @@ -71,9 +74,9 @@ test_assert(bool exp, const char *exp_str, fprintf(stderr, "==== Test Assertion Failure ====\n" " %s:%u: %s\n" - " pid=%d tid=%d - %s\n", + " pid=%d tid=%d errno=%d - %s\n", file, line, exp_str, getpid(), _gettid(), - strerror(errno)); + errno, strerror(errno)); test_dump_stack(); if (fmt) { fputs(" ", stderr); diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index bc75a91e00a6..f34d926d9735 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -11,7 +11,6 @@ #include <linux/elf.h> #include "kvm_util.h" -#include "kvm_util_internal.h" static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) { @@ -91,6 +90,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) " hdrp->e_shentsize: %x\n" " expected: %zx", hdrp->e_shentsize, sizeof(Elf64_Shdr)); + close(fd); } /* VM ELF Load @@ -111,8 +111,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) * by the image and it needs to have sufficient available physical pages, to * back the virtual pages used to load the image. */ -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot) +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) { off_t offset, offset_rv; Elf64_Ehdr hdr; @@ -140,7 +139,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, offset = hdr.e_phoff + (n1 * hdr.e_phentsize); offset_rv = lseek(fd, offset, SEEK_SET); TEST_ASSERT(offset_rv == offset, - "Failed to seek to begining of program header %u,\n" + "Failed to seek to beginning of program header %u,\n" " filename: %s\n" " rv: %jd errno: %i", n1, filename, (intmax_t) offset_rv, errno); @@ -158,14 +157,13 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, "memsize of 0,\n" " phdr index: %u p_memsz: 0x%" PRIx64, n1, (uint64_t) phdr.p_memsz); - vm_vaddr_t seg_vstart = phdr.p_vaddr; - seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1); + vm_vaddr_t seg_vstart = align_down(phdr.p_vaddr, vm->page_size); vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1; seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; - vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, - data_memslot, pgd_memslot); + vm_vaddr_t vaddr = __vm_vaddr_alloc(vm, seg_size, seg_vstart, + MEM_REGION_CODE); TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " "virtual memory for segment at requested min addr,\n" " segment idx: %u\n" @@ -186,11 +184,12 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, "Seek to program segment offset failed,\n" " program header idx: %u errno: %i\n" " offset_rv: 0x%jx\n" - " expected: 0x%jx\n", + " expected: 0x%jx", n1, errno, (intmax_t) offset_rv, (intmax_t) phdr.p_offset); test_read(fd, addr_gva2hva(vm, phdr.p_vaddr), phdr.p_filesz); } } + close(fd); } diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c new file mode 100644 index 000000000000..b04901e55138 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#include "guest_modes.h" + +#ifdef __aarch64__ +#include "processor.h" +enum vm_guest_mode vm_mode_default; +#endif + +struct guest_mode guest_modes[NUM_VM_MODES]; + +void guest_modes_append_default(void) +{ +#ifndef __aarch64__ + guest_mode_append(VM_MODE_DEFAULT, true); +#else + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + uint32_t ipa4k, ipa16k, ipa64k; + int i; + + aarch64_get_supported_page_sizes(limit, &ipa4k, &ipa16k, &ipa64k); + + guest_mode_append(VM_MODE_P52V48_4K, ipa4k >= 52); + guest_mode_append(VM_MODE_P52V48_16K, ipa16k >= 52); + guest_mode_append(VM_MODE_P52V48_64K, ipa64k >= 52); + + guest_mode_append(VM_MODE_P48V48_4K, ipa4k >= 48); + guest_mode_append(VM_MODE_P48V48_16K, ipa16k >= 48); + guest_mode_append(VM_MODE_P48V48_64K, ipa64k >= 48); + + guest_mode_append(VM_MODE_P40V48_4K, ipa4k >= 40); + guest_mode_append(VM_MODE_P40V48_16K, ipa16k >= 40); + guest_mode_append(VM_MODE_P40V48_64K, ipa64k >= 40); + + guest_mode_append(VM_MODE_P36V48_4K, ipa4k >= 36); + guest_mode_append(VM_MODE_P36V48_16K, ipa16k >= 36); + guest_mode_append(VM_MODE_P36V48_64K, ipa64k >= 36); + guest_mode_append(VM_MODE_P36V47_16K, ipa16k >= 36); + + vm_mode_default = ipa4k >= 40 ? VM_MODE_P40V48_4K : NUM_VM_MODES; + + /* + * Pick the first supported IPA size if the default + * isn't available. + */ + for (i = 0; vm_mode_default == NUM_VM_MODES && i < NUM_VM_MODES; i++) { + if (guest_modes[i].supported && guest_modes[i].enabled) + vm_mode_default = i; + } + + TEST_ASSERT(vm_mode_default != NUM_VM_MODES, + "No supported mode!"); + } +#endif +#ifdef __s390x__ + { + int kvm_fd, vm_fd; + struct kvm_s390_vm_cpu_processor info; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, NULL); + kvm_device_attr_get(vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + close(vm_fd); + close(kvm_fd); + /* Starting with z13 we have 47bits of physical address */ + if (info.ibc >= 0x30) + guest_mode_append(VM_MODE_P47V64_4K, true); + } +#endif +#ifdef __riscv + { + unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS); + + if (sz >= 52) + guest_mode_append(VM_MODE_P52V48_4K, true); + if (sz >= 48) + guest_mode_append(VM_MODE_P48V48_4K, true); + } +#endif +} + +void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg) +{ + int i; + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + func(i, arg); + } +} + +void guest_modes_help(void) +{ + int i; + + printf(" -m: specify the guest mode ID to test\n" + " (default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? 
" (supported)" : ""); + } +} + +void guest_modes_cmdline(const char *arg) +{ + static bool mode_selected; + unsigned int mode; + int i; + + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + + mode = atoi_non_negative("Guest mode ID", arg); + TEST_ASSERT(mode < NUM_VM_MODES, "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; +} diff --git a/tools/testing/selftests/kvm/lib/guest_sprintf.c b/tools/testing/selftests/kvm/lib/guest_sprintf.c new file mode 100644 index 000000000000..74627514c4d4 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/guest_sprintf.c @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "ucall_common.h" + +#define APPEND_BUFFER_SAFE(str, end, v) \ +do { \ + GUEST_ASSERT(str < end); \ + *str++ = (v); \ +} while (0) + +static int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static int skip_atoi(const char **s) +{ + int i = 0; + + while (isdigit(**s)) + i = i * 10 + *((*s)++) - '0'; + return i; +} + +#define ZEROPAD 1 /* pad with zero */ +#define SIGN 2 /* unsigned/signed long */ +#define PLUS 4 /* show plus */ +#define SPACE 8 /* space if plus */ +#define LEFT 16 /* left justified */ +#define SMALL 32 /* Must be 32 == 0x20 */ +#define SPECIAL 64 /* 0x */ + +#define __do_div(n, base) \ +({ \ + int __res; \ + \ + __res = ((uint64_t) n) % (uint32_t) base; \ + n = ((uint64_t) n) / (uint32_t) base; \ + __res; \ +}) + +static char *number(char *str, const char *end, long num, int base, int size, + int precision, int type) +{ + /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ + static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ + + char tmp[66]; + char c, sign, locase; + int i; + + /* + * locase = 0 or 0x20. ORing digits or letters with 'locase' + * produces same digits or (maybe lowercased) letters + */ + locase = (type & SMALL); + if (type & LEFT) + type &= ~ZEROPAD; + if (base < 2 || base > 16) + return NULL; + c = (type & ZEROPAD) ? '0' : ' '; + sign = 0; + if (type & SIGN) { + if (num < 0) { + sign = '-'; + num = -num; + size--; + } else if (type & PLUS) { + sign = '+'; + size--; + } else if (type & SPACE) { + sign = ' '; + size--; + } + } + if (type & SPECIAL) { + if (base == 16) + size -= 2; + else if (base == 8) + size--; + } + i = 0; + if (num == 0) + tmp[i++] = '0'; + else + while (num != 0) + tmp[i++] = (digits[__do_div(num, base)] | locase); + if (i > precision) + precision = i; + size -= precision; + if (!(type & (ZEROPAD + LEFT))) + while (size-- > 0) + APPEND_BUFFER_SAFE(str, end, ' '); + if (sign) + APPEND_BUFFER_SAFE(str, end, sign); + if (type & SPECIAL) { + if (base == 8) + APPEND_BUFFER_SAFE(str, end, '0'); + else if (base == 16) { + APPEND_BUFFER_SAFE(str, end, '0'); + APPEND_BUFFER_SAFE(str, end, 'x'); + } + } + if (!(type & LEFT)) + while (size-- > 0) + APPEND_BUFFER_SAFE(str, end, c); + while (i < precision--) + APPEND_BUFFER_SAFE(str, end, '0'); + while (i-- > 0) + APPEND_BUFFER_SAFE(str, end, tmp[i]); + while (size-- > 0) + APPEND_BUFFER_SAFE(str, end, ' '); + + return str; +} + +int guest_vsnprintf(char *buf, int n, const char *fmt, va_list args) +{ + char *str, *end; + const char *s; + uint64_t num; + int i, base; + int len; + + int flags; /* flags to number() */ + + int field_width; /* width of output field */ + int precision; /* + * min. 
# of digits for integers; max + * number of chars for from string + */ + int qualifier; /* 'h', 'l', or 'L' for integer fields */ + + end = buf + n; + GUEST_ASSERT(buf < end); + GUEST_ASSERT(n > 0); + + for (str = buf; *fmt; ++fmt) { + if (*fmt != '%') { + APPEND_BUFFER_SAFE(str, end, *fmt); + continue; + } + + /* process flags */ + flags = 0; +repeat: + ++fmt; /* this also skips first '%' */ + switch (*fmt) { + case '-': + flags |= LEFT; + goto repeat; + case '+': + flags |= PLUS; + goto repeat; + case ' ': + flags |= SPACE; + goto repeat; + case '#': + flags |= SPECIAL; + goto repeat; + case '0': + flags |= ZEROPAD; + goto repeat; + } + + /* get field width */ + field_width = -1; + if (isdigit(*fmt)) + field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + field_width = va_arg(args, int); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + } + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + precision = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + precision = va_arg(args, int); + } + if (precision < 0) + precision = 0; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') { + qualifier = *fmt; + ++fmt; + } + + /* + * Play nice with %llu, %llx, etc. KVM selftests only support + * 64-bit builds, so just treat %ll* the same as %l*. + */ + if (qualifier == 'l' && *fmt == 'l') + ++fmt; + + /* default base */ + base = 10; + + switch (*fmt) { + case 'c': + if (!(flags & LEFT)) + while (--field_width > 0) + APPEND_BUFFER_SAFE(str, end, ' '); + APPEND_BUFFER_SAFE(str, end, + (uint8_t)va_arg(args, int)); + while (--field_width > 0) + APPEND_BUFFER_SAFE(str, end, ' '); + continue; + + case 's': + s = va_arg(args, char *); + len = strnlen(s, precision); + + if (!(flags & LEFT)) + while (len < field_width--) + APPEND_BUFFER_SAFE(str, end, ' '); + for (i = 0; i < len; ++i) + APPEND_BUFFER_SAFE(str, end, *s++); + while (len < field_width--) + APPEND_BUFFER_SAFE(str, end, ' '); + continue; + + case 'p': + if (field_width == -1) { + field_width = 2 * sizeof(void *); + flags |= SPECIAL | SMALL | ZEROPAD; + } + str = number(str, end, + (uint64_t)va_arg(args, void *), 16, + field_width, precision, flags); + continue; + + case 'n': + if (qualifier == 'l') { + long *ip = va_arg(args, long *); + *ip = (str - buf); + } else { + int *ip = va_arg(args, int *); + *ip = (str - buf); + } + continue; + + case '%': + APPEND_BUFFER_SAFE(str, end, '%'); + continue; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'x': + flags |= SMALL; + case 'X': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + case 'u': + break; + + default: + APPEND_BUFFER_SAFE(str, end, '%'); + if (*fmt) + APPEND_BUFFER_SAFE(str, end, *fmt); + else + --fmt; + continue; + } + if (qualifier == 'l') + num = va_arg(args, uint64_t); + else if (qualifier == 'h') { + num = (uint16_t)va_arg(args, int); + if (flags & SIGN) + num = (int16_t)num; + } else if (flags & SIGN) + num = va_arg(args, int); + else + num = va_arg(args, uint32_t); + str = number(str, end, num, base, field_width, precision, flags); + } + + GUEST_ASSERT(str < end); + *str = '\0'; + return str - buf; +} + +int guest_snprintf(char *buf, int n, const char *fmt, ...) 
+{ + va_list va; + int len; + + va_start(va, fmt); + len = guest_vsnprintf(buf, n, fmt, va); + va_end(va); + + return len; +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 74776ee228f2..ad00e4761886 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -4,173 +4,285 @@ * * Copyright (C) 2018, Google LLC. */ - #include "test_util.h" #include "kvm_util.h" -#include "kvm_util_internal.h" #include "processor.h" +#include "ucall_common.h" #include <assert.h> +#include <sched.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include <unistd.h> #include <linux/kernel.h> -#define KVM_UTIL_PGS_PER_HUGEPG 512 #define KVM_UTIL_MIN_PFN 2 -/* Aligns x up to the next multiple of size. Size must be a power of 2. */ -static void *align(void *x, size_t size) +uint32_t guest_random_seed; +struct guest_random_state guest_rng; + +static int vcpu_mmap_sz(void); + +int open_path_or_exit(const char *path, int flags) { - size_t mask = size - 1; - TEST_ASSERT(size != 0 && !(size & (size - 1)), - "size not a power of 2: %lu", size); - return (void *) (((size_t) x + mask) & ~mask); + int fd; + + fd = open(path, flags); + __TEST_REQUIRE(fd >= 0 || errno != ENOENT, "Cannot open %s: %s", path, strerror(errno)); + TEST_ASSERT(fd >= 0, "Failed to open '%s'", path); + + return fd; } /* - * Capability + * Open KVM_DEV_PATH if available, otherwise exit the entire program. * * Input Args: - * cap - Capability - * - * Output Args: None + * flags - The flags to pass when opening KVM_DEV_PATH. * * Return: - * On success, the Value corresponding to the capability (KVM_CAP_*) - * specified by the value of cap. On failure a TEST_ASSERT failure - * is produced. - * - * Looks up and returns the value corresponding to the capability - * (KVM_CAP_*) given by cap. + * The opened file descriptor of /dev/kvm. */ -int kvm_check_cap(long cap) +static int _open_kvm_dev_path_or_exit(int flags) { - int ret; - int kvm_fd; + return open_path_or_exit(KVM_DEV_PATH, flags); +} + +int open_kvm_dev_path_or_exit(void) +{ + return _open_kvm_dev_path_or_exit(O_RDONLY); +} - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); +static ssize_t get_module_param(const char *module_name, const char *param, + void *buffer, size_t buffer_size) +{ + const int path_size = 128; + char path[path_size]; + ssize_t bytes_read; + int fd, r; - ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); + r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", + module_name, param); + TEST_ASSERT(r < path_size, + "Failed to construct sysfs path in %d bytes.", path_size); - close(kvm_fd); + fd = open_path_or_exit(path, O_RDONLY); - return ret; + bytes_read = read(fd, buffer, buffer_size); + TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes", + path, bytes_read, buffer_size); + + r = close(fd); + TEST_ASSERT(!r, "close(%s) failed", path); + return bytes_read; } -/* VM Enable Capability - * - * Input Args: - * vm - Virtual Machine - * cap - Capability - * - * Output Args: None - * - * Return: On success, 0. On failure a TEST_ASSERT failure is produced. - * - * Enables a capability (KVM_CAP_*) on the VM. 
- */ -int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +static int get_module_param_integer(const char *module_name, const char *param) { - int ret; + /* + * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the + * NUL char, and 1 byte because the kernel sucks and inserts a newline + * at the end. + */ + char value[16 + 1 + 1]; + ssize_t r; - ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); - TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); + memset(value, '\0', sizeof(value)); - return ret; + r = get_module_param(module_name, param, value, sizeof(value)); + TEST_ASSERT(value[r - 1] == '\n', + "Expected trailing newline, got char '%c'", value[r - 1]); + + /* + * Squash the newline, otherwise atoi_paranoid() will complain about + * trailing non-NUL characters in the string. + */ + value[r - 1] = '\0'; + return atoi_paranoid(value); } -static void vm_open(struct kvm_vm *vm, int perm) +static bool get_module_param_bool(const char *module_name, const char *param) { - vm->kvm_fd = open(KVM_DEV_PATH, perm); - if (vm->kvm_fd < 0) - exit(KSFT_SKIP); + char value; + ssize_t r; - if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { - print_skip("immediate_exit not available"); - exit(KSFT_SKIP); - } + r = get_module_param(module_name, param, &value, sizeof(value)); + TEST_ASSERT_EQ(r, 1); - vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); - TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " - "rc: %i errno: %i", vm->fd, errno); + if (value == 'Y') + return true; + else if (value == 'N') + return false; + + TEST_FAIL("Unrecognized value '%c' for boolean module param", value); } -const char * const vm_guest_mode_string[] = { - "PA-bits:52, VA-bits:48, 4K pages", - "PA-bits:52, VA-bits:48, 64K pages", - "PA-bits:48, VA-bits:48, 4K pages", - "PA-bits:48, VA-bits:48, 64K pages", - "PA-bits:40, VA-bits:48, 4K pages", - "PA-bits:40, VA-bits:48, 64K pages", - "PA-bits:ANY, VA-bits:48, 4K pages", -}; -_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, - "Missing new mode strings?"); - -struct vm_guest_mode_params { - unsigned int pa_bits; - unsigned int va_bits; - unsigned int page_size; - unsigned int page_shift; -}; +bool get_kvm_param_bool(const char *param) +{ + return get_module_param_bool("kvm", param); +} -static const struct vm_guest_mode_params vm_guest_mode_params[] = { - { 52, 48, 0x1000, 12 }, - { 52, 48, 0x10000, 16 }, - { 48, 48, 0x1000, 12 }, - { 48, 48, 0x10000, 16 }, - { 40, 48, 0x1000, 12 }, - { 40, 48, 0x10000, 16 }, - { 0, 0, 0x1000, 12 }, -}; -_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, - "Missing new mode params?"); +bool get_kvm_intel_param_bool(const char *param) +{ + return get_module_param_bool("kvm_intel", param); +} + +bool get_kvm_amd_param_bool(const char *param) +{ + return get_module_param_bool("kvm_amd", param); +} + +int get_kvm_param_integer(const char *param) +{ + return get_module_param_integer("kvm", param); +} + +int get_kvm_intel_param_integer(const char *param) +{ + return get_module_param_integer("kvm_intel", param); +} + +int get_kvm_amd_param_integer(const char *param) +{ + return get_module_param_integer("kvm_amd", param); +} /* - * VM Create + * Capability * * Input Args: - * mode - VM Mode (e.g. VM_MODE_P52V48_4K) - * phy_pages - Physical memory pages - * perm - permission + * cap - Capability * * Output Args: None * * Return: - * Pointer to opaque structure that describes the created VM. 
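/*
 * Illustrative usage sketch (not taken from this patch): on an AMD host a
 * test might gate itself on a module parameter via the helpers above.  The
 * "npt" parameter name is only an example; substitute whichever kvm,
 * kvm_intel or kvm_amd parameter the test actually depends on.
 */
static void example_require_npt(void)
{
        __TEST_REQUIRE(get_kvm_amd_param_bool("npt"),
                       "NPT is disabled, skipping test");
}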
+ * On success, the Value corresponding to the capability (KVM_CAP_*) + * specified by the value of cap. On failure a TEST_ASSERT failure + * is produced. * - * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). - * When phy_pages is non-zero, a memory region of phy_pages physical pages - * is created and mapped starting at guest physical address 0. The file - * descriptor to control the created VM is created with the permissions - * given by perm (e.g. O_RDWR). + * Looks up and returns the value corresponding to the capability + * (KVM_CAP_*) given by cap. */ -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +unsigned int kvm_check_cap(long cap) { - struct kvm_vm *vm; + int ret; + int kvm_fd; + + kvm_fd = open_kvm_dev_path_or_exit(); + ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); + + close(kvm_fd); - pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__, - vm_guest_mode_string(mode), phy_pages, perm); + return (unsigned int)ret; +} + +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) +{ + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + vm->dirty_ring_size = ring_size; +} + +static void vm_open(struct kvm_vm *vm) +{ + vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); + + vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); + TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); +} + +const char *vm_guest_mode_string(uint32_t i) +{ + static const char * const strings[] = { + [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", + [VM_MODE_P52V48_16K] = "PA-bits:52, VA-bits:48, 16K pages", + [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", + [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", + [VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages", + [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages", + [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", + [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", + [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", + [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", + [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", + [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", + [VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages", + [VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages", + [VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages", + }; + _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, + "Missing new mode strings?"); + + TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i); + + return strings[i]; +} + +const struct vm_guest_mode_params vm_guest_mode_params[] = { + [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, + [VM_MODE_P52V48_16K] = { 52, 48, 0x4000, 14 }, + [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, + [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, + [VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 }, + [VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 }, + [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, + [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, + [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, + [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, + 
[VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, + [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, + [VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 }, + [VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 }, + [VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 }, +}; +_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, + "Missing new mode params?"); + +/* + * Initializes vm->vpages_valid to match the canonical VA space of the + * architecture. + * + * The default implementation is valid for architectures which split the + * range addressed by a single page table into a low and high region + * based on the MSB of the VA. On architectures with this behavior + * the VA region spans [0, 2^(va_bits - 1)), [-(2^(va_bits - 1), -1]. + */ +__weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) +{ + sparsebit_set_num(vm->vpages_valid, + 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + sparsebit_set_num(vm->vpages_valid, + (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, + (1ULL << (vm->va_bits - 1)) >> vm->page_shift); +} + +struct kvm_vm *____vm_create(struct vm_shape shape) +{ + struct kvm_vm *vm; vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); INIT_LIST_HEAD(&vm->vcpus); - INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); - vm->mode = mode; - vm->type = 0; + vm->mode = shape.mode; + vm->type = shape.type; - vm->pa_bits = vm_guest_mode_params[mode].pa_bits; - vm->va_bits = vm_guest_mode_params[mode].va_bits; - vm->page_size = vm_guest_mode_params[mode].page_size; - vm->page_shift = vm_guest_mode_params[mode].page_shift; + vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; + vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; + vm->page_size = vm_guest_mode_params[vm->mode].page_size; + vm->page_shift = vm_guest_mode_params[vm->mode].page_shift; /* Setup mode specific traits. */ switch (vm->mode) { @@ -187,18 +299,30 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) vm->pgtable_levels = 3; break; case VM_MODE_P40V48_4K: + case VM_MODE_P36V48_4K: vm->pgtable_levels = 4; break; case VM_MODE_P40V48_64K: + case VM_MODE_P36V48_64K: + vm->pgtable_levels = 3; + break; + case VM_MODE_P52V48_16K: + case VM_MODE_P48V48_16K: + case VM_MODE_P40V48_16K: + case VM_MODE_P36V48_16K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; break; case VM_MODE_PXXV48_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); + kvm_init_vm_address_properties(vm); /* * Ignore KVM support for 5-level paging (vm->va_bits == 57), * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this VM_MODE. + * isn't for this mode (48-bit virtual address space). 
*/ TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, "Linear address width (%d bits) not supported", @@ -211,40 +335,162 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; + case VM_MODE_P44V64_4K: + vm->pgtable_levels = 5; + break; default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); + TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); } #ifdef __aarch64__ + TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types"); if (vm->pa_bits != 40) vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); #endif - vm_open(vm, perm); + vm_open(vm); /* Limit to VA-bit canonical virtual addresses. */ vm->vpages_valid = sparsebit_alloc(); - sparsebit_set_num(vm->vpages_valid, - 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift); - sparsebit_set_num(vm->vpages_valid, - (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift, - (1ULL << (vm->va_bits - 1)) >> vm->page_shift); + vm_vaddr_populate_bitmap(vm); /* Limit physical addresses to PA-bits. */ - vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; + vm->max_gfn = vm_compute_max_gfn(vm); /* Allocate and setup memory for guest. */ vm->vpages_mapped = sparsebit_alloc(); - if (phy_pages != 0) - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - 0, 0, phy_pages, 0); return vm; } -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, + uint32_t nr_runnable_vcpus, + uint64_t extra_mem_pages) { - return _vm_create(mode, phy_pages, perm); + uint64_t page_size = vm_guest_mode_params[mode].page_size; + uint64_t nr_pages; + + TEST_ASSERT(nr_runnable_vcpus, + "Use vm_create_barebones() for VMs that _never_ have vCPUs"); + + TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), + "nr_vcpus = %d too large for host, max-vcpus = %d", + nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); + + /* + * Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the + * test code and other per-VM assets that will be loaded into memslot0. + */ + nr_pages = 512; + + /* Account for the per-vCPU stacks on behalf of the test. */ + nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; + + /* + * Account for the number of pages needed for the page tables. The + * maximum page table size for a memory region will be when the + * smallest page size is used. Considering each page contains x page + * table descriptors, the total extra size for page tables (for extra + * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller + * than N/x*2. + */ + nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; + + /* Account for the number of pages needed by ucall. 
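/*
 * Worked example (illustrative numbers, not from this patch): with 4KiB
 * pages a page table holds 512 entries, so backing an extra 1GiB
 * (extra_mem_pages = 262144) takes roughly 262144/512 + 262144/512^2 + ...
 * = 513 page-table pages, comfortably below the 2 * 262144/512 = 1024
 * pages reserved by the N/x*2 bound above.
 */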
*/ + nr_pages += ucall_nr_pages_required(page_size); + + return vm_adjust_num_guest_pages(mode, nr_pages); +} + +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages) +{ + uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus, + nr_extra_pages); + struct userspace_mem_region *slot0; + struct kvm_vm *vm; + int i; + + pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, + vm_guest_mode_string(shape.mode), shape.type, nr_pages); + + vm = ____vm_create(shape); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; + + kvm_vm_elf_load(vm, program_invocation_name); + + /* + * TODO: Add proper defines to protect the library's memslots, and then + * carve out memslot1 for the ucall MMIO address. KVM treats writes to + * read-only memslots as MMIO, and creating a read-only memslot for the + * MMIO region would prevent silently clobbering the MMIO region. + */ + slot0 = memslot2region(vm, 0); + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + + pr_info("Random seed: 0x%x\n", guest_random_seed); + guest_rng = new_guest_random_state(guest_random_seed); + sync_global_to_guest(vm, guest_rng); + + kvm_arch_vm_post_create(vm); + + return vm; +} + +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * extra_mem_pages - Non-slot0 physical memory total size + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. + */ +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]) +{ + struct kvm_vm *vm; + int i; + + TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); + + vm = __vm_create(shape, nr_vcpus, extra_mem_pages); + + for (i = 0; i < nr_vcpus; ++i) + vcpus[i] = vm_vcpu_add(vm, i, guest_code); + + return vm; +} + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + struct kvm_vcpu *vcpus[1]; + struct kvm_vm *vm; + + vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus); + + *vcpu = vcpus[0]; + return vm; } /* @@ -252,7 +498,6 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) * * Input Args: * vm - VM that has been released before - * perm - permission * * Output Args: None * @@ -260,17 +505,19 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) * global state, such as the irqchip and the memory regions that are mapped * into the guest. 
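/*
 * Illustrative usage sketch (not taken from this patch): typical test setup
 * on top of the helpers above.  guest_main() and the GUEST_DONE() ucall are
 * placeholders supplied by the individual test.
 */
static void guest_main(void)
{
        GUEST_DONE();
}

static void example_setup(void)
{
        struct vm_shape shape = { .mode = VM_MODE_DEFAULT };
        struct kvm_vcpu *vcpus[2];
        struct kvm_vm *vm;

        vm = __vm_create_with_vcpus(shape, 2, 0, guest_main, vcpus);

        /* ... run vcpus[0] and vcpus[1] ... */

        kvm_vm_free(vm);
}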
*/ -void kvm_vm_restart(struct kvm_vm *vmp, int perm) +void kvm_vm_restart(struct kvm_vm *vmp) { + int ctr; struct userspace_mem_region *region; - vm_open(vmp, perm); + vm_open(vmp); if (vmp->has_irqchip) vm_create_irqchip(vmp); - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { - int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%llx size: 0x%llx", @@ -281,27 +528,87 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) } } -void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) { - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - int ret; + return __vm_vcpu_add(vm, vcpu_id); +} + +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) +{ + kvm_vm_restart(vm); - ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", - __func__, strerror(-ret)); + return vm_vcpu_recreate(vm, 0); } -void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) +void kvm_pin_this_task_to_pcpu(uint32_t pcpu) { - struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, - .first_page = first_page, - .num_pages = num_pages }; - int ret; + cpu_set_t mask; + int r; + + CPU_ZERO(&mask); + CPU_SET(pcpu, &mask); + r = sched_setaffinity(0, sizeof(mask), &mask); + TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.", pcpu); +} - ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", - __func__, strerror(-ret)); +static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) +{ + uint32_t pcpu = atoi_non_negative("CPU number", cpu_str); + + TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask), + "Not allowed to run on pCPU '%d', check cgroups?", pcpu); + return pcpu; +} + +void kvm_print_vcpu_pinning_help(void) +{ + const char *name = program_invocation_name; + + printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" + " values (target pCPU), one for each vCPU, plus an optional\n" + " entry for the main application task (specified via entry\n" + " <nr_vcpus + 1>). If used, entries must be provided for all\n" + " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" + " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" + " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" + " %s -v 3 -c 22,23,24,50\n\n" + " To leave the application task unpinned, drop the final entry:\n\n" + " %s -v 3 -c 22,23,24\n\n" + " (default: no pinning)\n", name, name); +} + +void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], + int nr_vcpus) +{ + cpu_set_t allowed_mask; + char *cpu, *cpu_list; + char delim[2] = ","; + int i, r; + + cpu_list = strdup(pcpus_string); + TEST_ASSERT(cpu_list, "strdup() allocation failed."); + + r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask); + TEST_ASSERT(!r, "sched_getaffinity() failed"); + + cpu = strtok(cpu_list, delim); + + /* 1. Get all pcpus for vcpus. 
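/*
 * Illustrative usage sketch (not taken from this patch): feeding a "-c"
 * style argument such as "22,23,24,50" (three vCPUs plus the main task)
 * into the helpers above.  The fixed-size array is an assumption of the
 * sketch, not of the library.
 */
static void example_pinning(const char *arg, int nr_vcpus)
{
        uint32_t vcpu_to_pcpu[64];

        kvm_parse_vcpu_pinning(arg, vcpu_to_pcpu, nr_vcpus);

        /* Each vCPU worker would later pin itself before entering the guest. */
        kvm_pin_this_task_to_pcpu(vcpu_to_pcpu[0]);
}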
*/ + for (i = 0; i < nr_vcpus; i++) { + TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i); + vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask); + cpu = strtok(NULL, delim); + } + + /* 2. Check if the main worker needs to be pinned. */ + if (cpu) { + kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask)); + cpu = strtok(NULL, delim); + } + + TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu); + free(cpu_list); } /* @@ -326,74 +633,29 @@ void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; } -/* - * KVM Userspace Memory Region Find - * - * Input Args: - * vm - Virtual Machine - * start - Starting VM physical address - * end - Ending VM physical address, inclusive. - * - * Output Args: None - * - * Return: - * Pointer to overlapping region, NULL if no such region. - * - * Public interface to userspace_mem_region_find. Allows tests to look up - * the memslot datastructure for a given range of guest physical memory. - */ -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end) +__weak void vcpu_arch_free(struct kvm_vcpu *vcpu) { - struct userspace_mem_region *region; - - region = userspace_mem_region_find(vm, start, end); - if (!region) - return NULL; - return ®ion->region; -} - -/* - * VCPU Find - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to VCPU structure - * - * Locates a vcpu structure that describes the VCPU specified by vcpuid and - * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU - * for the specified vcpuid. - */ -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu; - - list_for_each_entry(vcpu, &vm->vcpus, list) { - if (vcpu->id == vcpuid) - return vcpu; - } - - return NULL; } /* @@ -408,53 +670,70 @@ struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) * * Removes a vCPU from a VM and frees its resources. 
*/ -static void vm_vcpu_rm(struct vcpu *vcpu) +static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { int ret; - ret = munmap(vcpu->state, sizeof(*vcpu->state)); - TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " - "errno: %i", ret, errno); - close(vcpu->fd); - TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " - "errno: %i", ret, errno); + if (vcpu->dirty_gfns) { + ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + vcpu->dirty_gfns = NULL; + } + + ret = munmap(vcpu->run, vcpu_mmap_sz()); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + + ret = close(vcpu->fd); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); list_del(&vcpu->list); + + vcpu_arch_free(vcpu); free(vcpu); } void kvm_vm_release(struct kvm_vm *vmp) { - struct vcpu *vcpu, *tmp; + struct kvm_vcpu *vcpu, *tmp; int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) - vm_vcpu_rm(vcpu); + vm_vcpu_rm(vmp, vcpu); ret = close(vmp->fd); - TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" - " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - close(vmp->kvm_fd); - TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" - " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); + ret = close(vmp->kvm_fd); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region) + struct userspace_mem_region *region, + bool unlink) { int ret; - list_del(®ion->list); + if (unlink) { + rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); + rb_erase(®ion->hva_node, &vm->regions.hva_tree); + hash_del(®ion->slot_node); + } region->region.memory_size = 0; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " - "rc: %i errno: %i", ret, errno); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); sparsebit_free(®ion->unused_phy_pages); + sparsebit_free(®ion->protected_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + if (region->fd >= 0) { + /* There's an extra map when using shared memory. */ + ret = munmap(region->mmap_alias, region->mmap_size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + close(region->fd); + } + if (region->region.guest_memfd >= 0) + close(region->region.guest_memfd); free(region); } @@ -464,14 +743,22 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, */ void kvm_vm_free(struct kvm_vm *vmp) { - struct userspace_mem_region *region, *tmp; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; + /* Free cached stats metadata and close FD */ + if (vmp->stats_fd) { + free(vmp->stats_desc); + close(vmp->stats_fd); + } + /* Free userspace_mem_regions. */ - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) - __vm_mem_region_delete(vmp, region); + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); /* Free sparsebit arrays. 
*/ sparsebit_free(&vmp->vpages_valid); @@ -483,6 +770,26 @@ void kvm_vm_free(struct kvm_vm *vmp) free(vmp); } +int kvm_memfd_alloc(size_t size, bool hugepages) +{ + int memfd_flags = MFD_CLOEXEC; + int fd, r; + + if (hugepages) + memfd_flags |= MFD_HUGETLB; + + fd = memfd_create("kvm_selftest", memfd_flags); + TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); + + r = ftruncate(fd, size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); + + r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + return fd; +} + /* * Memory Compare, host virtual to guest virtual * @@ -553,38 +860,129 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } -/* - * VM Userspace Memory Region Add - * - * Input Args: - * vm - Virtual Machine - * backing_src - Storage source for this region. - * NULL to use anonymous memory. - * guest_paddr - Starting guest physical address - * slot - KVM region slot - * npages - Number of physical pages - * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES) - * - * Output Args: None - * - * Return: None - * - * Allocates a memory area of the number of pages specified by npages - * and maps it to the VM specified by vm, at a starting physical address - * given by guest_paddr. The region is created with a KVM region slot - * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The - * region is created with the flags given by flags. - */ -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags) +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->gpa_node, parent, cur); + rb_insert_color(®ion->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->hva_node, parent, cur); + rb_insert_color(®ion->hva_node, hva_tree); +} + + +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva) +{ + struct kvm_userspace_memory_region region = { + .slot = slot, + .flags = flags, + .guest_phys_addr = gpa, + .memory_size = size, + .userspace_addr = (uintptr_t)hva, + }; + + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion); +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva) +{ + int ret = 
__vm_set_user_memory_region(vm, slot, flags, gpa, size, hva); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)", + errno, strerror(errno)); +} + +#define TEST_REQUIRE_SET_USER_MEMORY_REGION2() \ + __TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2), \ + "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)") + +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + struct kvm_userspace_memory_region2 region = { + .slot = slot, + .flags = flags, + .guest_phys_addr = gpa, + .memory_size = size, + .userspace_addr = (uintptr_t)hva, + .guest_memfd = guest_memfd, + .guest_memfd_offset = guest_memfd_offset, + }; + + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); +} + +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva, + guest_memfd, guest_memfd_offset); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)", + errno, strerror(errno)); +} + + +/* FIXME: This thing needs to be ripped apart and rewritten. */ +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; - size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + size_t backing_src_pagesz = get_backing_src_pagesz(src_type); + size_t mem_size = npages * vm->page_size; size_t alignment; + TEST_REQUIRE_SET_USER_MEMORY_REGION2(); + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); @@ -617,7 +1015,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { if (region->region.slot != slot) continue; @@ -634,7 +1033,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); TEST_ASSERT(region != NULL, "Insufficient Memory"); - region->mmap_size = npages * vm->page_size; + region->mmap_size = mem_size; #ifdef __s390x__ /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ @@ -643,37 +1042,79 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, alignment = 1; #endif + /* + * When using THP mmap is not guaranteed to returned a hugepage aligned + * address so we have to pad the mmap. Padding is not needed for HugeTLB + * because mmap will always return an address aligned to the HugeTLB + * page size. 
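/*
 * Illustrative usage sketch (not taken from this patch): registering plain
 * anonymous host memory as a memslot through the wrapper above.  The slot
 * number, GPA and size are arbitrary example values.
 */
static void example_raw_memslot(struct kvm_vm *vm)
{
        size_t size = 2 * 1024 * 1024;
        void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        TEST_ASSERT(hva != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", -1));
        vm_set_user_memory_region(vm, 10, 0, 0x10000000, size, hva);
}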
+ */ if (src_type == VM_MEM_SRC_ANONYMOUS_THP) - alignment = max(huge_page_size, alignment); + alignment = max(backing_src_pagesz, alignment); + + TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) region->mmap_size += alignment; + region->fd = -1; + if (backing_src_is_shared(src_type)) + region->fd = kvm_memfd_alloc(region->mmap_size, + src_type == VM_MEM_SRC_SHARED_HUGETLB); + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS - | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0), - -1, 0); + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, - "test_malloc failed, mmap_start: %p errno: %i", - region->mmap_start, errno); + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + + TEST_ASSERT(!is_backing_src_hugetlb(src_type) || + region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), + "mmap_start %p is not aligned to HugeTLB page size 0x%lx", + region->mmap_start, backing_src_pagesz); /* Align host address */ - region->host_mem = align(region->mmap_start, alignment); + region->host_mem = align_ptr_up(region->mmap_start, alignment); /* As needed perform madvise */ - if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { - ret = madvise(region->host_mem, npages * vm->page_size, - src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); - TEST_ASSERT(ret == 0, "madvise failed,\n" - " addr: %p\n" - " length: 0x%lx\n" - " src_type: %x", - region->host_mem, npages * vm->page_size, src_type); + if ((src_type == VM_MEM_SRC_ANONYMOUS || + src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { + ret = madvise(region->host_mem, mem_size, + src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", + region->host_mem, mem_size, + vm_mem_backing_src_alias(src_type)->name); + } + + region->backing_src_type = src_type; + + if (flags & KVM_MEM_GUEST_MEMFD) { + if (guest_memfd < 0) { + uint32_t guest_memfd_flags = 0; + TEST_ASSERT(!guest_memfd_offset, + "Offset must be zero when creating new guest_memfd"); + guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); + } else { + /* + * Install a unique fd for each memslot so that the fd + * can be closed when the region is deleted without + * needing to track if the fd is owned by the framework + * or by the caller. 
+ */ + guest_memfd = dup(guest_memfd); + TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + } + + region->region.guest_memfd = guest_memfd; + region->region.guest_memfd_offset = guest_memfd_offset; + } else { + region->region.guest_memfd = -1; } region->unused_phy_pages = sparsebit_alloc(); + if (vm_arch_has_protected_memory(vm)) + region->protected_phy_pages = sparsebit_alloc(); sparsebit_set_num(region->unused_phy_pages, guest_paddr >> vm->page_shift, npages); region->region.slot = slot; @@ -681,16 +1122,40 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.guest_phys_addr = guest_paddr; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx", + " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size); + guest_paddr, (uint64_t) region->region.memory_size, + region->region.guest_memfd); + + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); + + /* If shared memory, create an alias. */ + if (region->fd >= 0) { + region->mmap_alias = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); + TEST_ASSERT(region->mmap_alias != MAP_FAILED, + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + + /* Align host alias address */ + region->host_alias = align_ptr_up(region->mmap_alias, alignment); + } +} - /* Add to linked-list of memory regions. 
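/*
 * Illustrative usage sketch (not taken from this patch): adding a
 * guest_memfd backed memslot with vm_mem_add() above.  Passing
 * guest_memfd = -1 lets the library create the guest_memfd itself; the
 * slot, GPA and page count are arbitrary example values.
 */
static void example_private_memslot(struct kvm_vm *vm)
{
        vm_mem_add(vm, VM_MEM_SRC_ANONYMOUS, 0x80000000, 10, 512,
                   KVM_MEM_GUEST_MEMFD, -1, 0);
}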
*/ - list_add(®ion->list, &vm->userspace_mem_regions); +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, + uint64_t npages, uint32_t flags) +{ + vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); } /* @@ -713,10 +1178,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) return region; - } fprintf(stderr, "No mem region with the requested slot found,\n" " requested slot: %u\n", memslot); @@ -749,9 +1214,9 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) region->region.flags = flags; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i slot: %u flags: 0x%x", ret, errno, slot, flags); } @@ -779,9 +1244,9 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) region->region.guest_phys_addr = new_gpa; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n" "ret: %i errno: %i slot: %u new_gpa: 0x%lx", ret, errno, slot, new_gpa); } @@ -801,85 +1266,97 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } -/* - * VCPU mmap Size - * - * Input Args: None - * - * Output Args: None - * - * Return: - * Size of VCPU state - * - * Returns the size of the structure pointed to by the return value - * of vcpu_state(). - */ +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, + bool punch_hole) +{ + const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0); + struct userspace_mem_region *region; + uint64_t end = base + size; + uint64_t gpa, len; + off_t fd_offset; + int ret; + + for (gpa = base; gpa < end; gpa += len) { + uint64_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD, + "Private memory region not found for GPA 0x%lx", gpa); + + offset = gpa - region->region.guest_phys_addr; + fd_offset = region->region.guest_memfd_offset + offset; + len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); + + ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); + TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx", + punch_hole ? "punch hole" : "allocate", gpa, len, + region->region.guest_memfd, mode, fd_offset); + } +} + +/* Returns the size of a vCPU's kvm_run structure. 
*/ static int vcpu_mmap_sz(void) { int dev_fd, ret; - dev_fd = open(KVM_DEV_PATH, O_RDONLY); - if (dev_fd < 0) - exit(KSFT_SKIP); + dev_fd = open_kvm_dev_path_or_exit(); ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); TEST_ASSERT(ret >= sizeof(struct kvm_run), - "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i", - __func__, ret, errno); + KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); close(dev_fd); return ret; } +static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu; + + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpu_id) + return true; + } + + return false; +} + /* - * VM VCPU Add - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: None - * - * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid. - * No additional VCPU setup is done. + * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id. + * No additional vCPU setup is done. Returns the vCPU. */ -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - vcpu = vcpu_find(vm, vcpuid); - if (vcpu != NULL) - TEST_FAIL("vcpu with the specified id " - "already exists,\n" - " requested vcpuid: %u\n" - " existing vcpuid: %u state: %p", - vcpuid, vcpu->id, vcpu->state); + TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); - vcpu->id = vcpuid; - vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid); - TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i", - vcpu->fd, errno); - TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " + vcpu->vm = vm; + vcpu->id = vcpu_id; + vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); + TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm); + + TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", - vcpu_mmap_sz(), sizeof(*vcpu->state)); - vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state), + vcpu_mmap_sz(), sizeof(*vcpu->run)); + vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); - TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " - "vcpu id: %u errno: %i", vcpuid, errno); + TEST_ASSERT(vcpu->run != MAP_FAILED, + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); /* Add to linked-list of VCPUs. */ list_add(&vcpu->list, &vm->vcpus); + + return vcpu; } /* @@ -902,8 +1379,8 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) * TEST_ASSERT failure occurs for invalid input or no area of at least * sz unallocated bytes >= vaddr_min is available. 
*/ -static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, - vm_vaddr_t vaddr_min) +vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min) { uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift; @@ -968,6 +1445,50 @@ va_found: return pgidx_start * vm->page_size; } +static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type, + bool protected) +{ + uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); + + virt_pgd_alloc(vm); + vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages, + KVM_UTIL_MIN_PFN * vm->page_size, + vm->memslots[type], protected); + + /* + * Find an unused range of virtual page addresses of at least + * pages in length. + */ + vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); + + /* Map the virtual pages. */ + for (vm_vaddr_t vaddr = vaddr_start; pages > 0; + pages--, vaddr += vm->page_size, paddr += vm->page_size) { + + virt_pg_map(vm, vaddr, paddr); + + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); + } + + return vaddr_start; +} + +vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type) +{ + return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, + vm_arch_has_protected_memory(vm)); +} + +vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz, + vm_vaddr_t vaddr_min, + enum kvm_mem_region_type type) +{ + return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false); +} + /* * VM Virtual Address Allocate * @@ -975,8 +1496,6 @@ va_found: * vm - Virtual Machine * sz - Size in bytes * vaddr_min - Minimum starting virtual address - * data_memslot - Memory region slot for data pages - * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * @@ -987,36 +1506,54 @@ va_found: * given by vm. The allocated bytes are mapped to a virtual address >= * the address given by vaddr_min. Note that each allocation uses a * a unique set of pages, with the minimum real allocation being at least - * a page. + * a page. The allocated physical space comes from the TEST_DATA memory region. */ -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot) +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) { - uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); - - virt_pgd_alloc(vm, pgd_memslot); - - /* - * Find an unused range of virtual page addresses of at least - * pages in length. - */ - vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min); - - /* Map the virtual pages. */ - for (vm_vaddr_t vaddr = vaddr_start; pages > 0; - pages--, vaddr += vm->page_size) { - vm_paddr_t paddr; - - paddr = vm_phy_page_alloc(vm, - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); + return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA); +} - virt_pg_map(vm, vaddr, paddr, pgd_memslot); +/* + * VM Virtual Address Allocate Pages + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least N system pages worth of bytes within the virtual address + * space of the vm. 
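/*
 * Illustrative usage sketch (not taken from this patch): allocating a
 * guest-virtual buffer from the TEST_DATA region and initializing it from
 * the host.  The size is an example value.
 */
static vm_vaddr_t example_alloc_buffer(struct kvm_vm *vm)
{
        vm_vaddr_t gva = vm_vaddr_alloc(vm, 8192, KVM_UTIL_MIN_VADDR);

        /* The host writes through the GVA->HVA translation. */
        memset(addr_gva2hva(vm, gva), 0, 8192);
        return gva;
}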
+ */ +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) +{ + return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); +} - sparsebit_set(vm->vpages_mapped, - vaddr >> vm->page_shift); - } +vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type) +{ + return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type); +} - return vaddr_start; +/* + * VM Virtual Address Allocate Page + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least one system page worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) +{ + return vm_vaddr_alloc_pages(vm, 1); } /* @@ -1027,7 +1564,6 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, * vaddr - Virtuall address to map * paddr - VM Physical Address * npages - The number of pages to map - * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * @@ -1037,7 +1573,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, * @npages starting at @vaddr to the page range starting at @paddr. */ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - unsigned int npages, uint32_t pgd_memslot) + unsigned int npages) { size_t page_size = vm->page_size; size_t size = npages * page_size; @@ -1046,7 +1582,9 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); + vaddr += page_size; paddr += page_size; } @@ -1073,16 +1611,16 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + gpa = vm_untag_gpa(vm, gpa); + + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_FAIL("No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1104,15 +1642,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; + struct rb_node *node; + + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); + + if (hva >= region->host_mem) { + if (hva <= (region->host_mem + + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + node = node->rb_right; + } else + node = node->rb_left; } TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); @@ -1120,402 +1665,265 @@ vm_paddr_t 
addr_hva2gpa(struct kvm_vm *vm, void *hva) } /* - * VM Create IRQ Chip + * Address VM physical to Host Virtual *alias*. * * Input Args: * vm - Virtual Machine + * gpa - VM physical address * * Output Args: None * - * Return: None - * - * Creates an interrupt controller chip for the VM specified by vm. + * Return: + * Equivalent address within the host virtual *alias* area, or NULL + * (without failing the test) if the guest memory is not shared (so + * no alias exists). + * + * Create a writable, shared virtual=>physical alias for the specific GPA. + * The primary use case is to allow the host selftest to manipulate guest + * memory without mapping said memory in the guest's address space. And, for + * userfaultfd-based demand paging, to do so without triggering userfaults. */ -void vm_create_irqchip(struct kvm_vm *vm) +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) { - int ret; - - ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); - TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " - "rc: %i errno: %i", ret, errno); + struct userspace_mem_region *region; + uintptr_t offset; - vm->has_irqchip = true; -} + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) + return NULL; -/* - * VM VCPU State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to structure that describes the state of the VCPU. - * - * Locates and returns a pointer to a structure that describes the - * state of the VCPU with the given vcpuid. - */ -struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + if (!region->host_alias) + return NULL; - return vcpu->state; + offset = gpa - region->region.guest_phys_addr; + return (void *) ((uintptr_t) region->host_alias + offset); } -/* - * VM VCPU Run - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: None - * - * Switch to executing the code for the VCPU given by vcpuid, within the VM - * given by vm. - */ -void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +/* Create an interrupt controller chip for the specified VM. 
*/ +void vm_create_irqchip(struct kvm_vm *vm) { - int ret = _vcpu_run(vm, vcpuid); - TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " - "rc: %i errno: %i", ret, errno); + vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); + + vm->has_irqchip = true; } -int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +int _vcpu_run(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int rc; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); do { - rc = ioctl(vcpu->fd, KVM_RUN, NULL); + rc = __vcpu_run(vcpu); } while (rc == -1 && errno == EINTR); - return rc; -} -void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + assert_on_unhandled_exception(vcpu); - vcpu->state->immediate_exit = 1; - ret = ioctl(vcpu->fd, KVM_RUN, NULL); - vcpu->state->immediate_exit = 0; - - TEST_ASSERT(ret == -1 && errno == EINTR, - "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", - ret, errno); -} - -void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_guest_debug *debug) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug); - - TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret); + return rc; } /* - * VM VCPU Set MP State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * mp_state - mp_state to be set - * - * Output Args: None - * - * Return: None - * - * Sets the MP state of the VCPU given by vcpuid, to the state given - * by mp_state. + * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR. + * Assert if the KVM returns an error (other than -EINTR). */ -void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state) +void vcpu_run(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + int ret = _vcpu_run(vcpu); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state); - TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, " - "rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); } -/* - * VM VCPU Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * regs - current state of VCPU regs - * - * Return: None - * - * Obtains the current register state for the VCPU specified by vcpuid - * and stores it at the location given by regs. - */ -void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) +void vcpu_run_complete_io(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + vcpu->run->immediate_exit = 1; + ret = __vcpu_run(vcpu); + vcpu->run->immediate_exit = 0; - ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", - ret, errno); + TEST_ASSERT(ret == -1 && errno == EINTR, + "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", + ret, errno); } /* - * VM VCPU Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * regs - Values to set VCPU regs to - * - * Output Args: None - * - * Return: None - * - * Sets the regs of the VCPU specified by vcpuid to the values - * given by regs. + * Get the list of guest registers which are supported for + * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. 
Returns a kvm_reg_list pointer, + * it is the caller's responsibility to free the list. */ -void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", - ret, errno); -} - -#ifdef __KVM_HAVE_VCPU_EVENTS -void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®_list_n); + TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); - ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); + reg_list->n = reg_list_n.n; + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list); + return reg_list; } -void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + uint32_t page_size = getpagesize(); + uint32_t size = vcpu->vm->dirty_ring_size; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + TEST_ASSERT(size > 0, "Should enable dirty ring first"); - ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); -} -#endif + if (!vcpu->dirty_gfns) { + void *addr; -#ifdef __x86_64__ -void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private"); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec"); - ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state); - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); -} + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); -int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state, bool ignore_error) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); - if (!ignore_error) { - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); + vcpu->dirty_gfns = addr; + vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn); } - return ret; + return vcpu->dirty_gfns; } -#endif /* - * VM VCPU System Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * sregs - current state of VCPU system regs - * - * Return: None - * - * Obtains the current system register state for the VCPU 
specified by - * vcpuid and stores it at the location given by sregs. + * Device Ioctl */ -void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + struct kvm_device_attr attribute = { + .group = group, + .attr = attr, + .flags = 0, + }; - ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); - TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", - ret, errno); + return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); } -/* - * VM VCPU System Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * sregs - Values to set VCPU system regs to - * - * Output Args: None - * - * Return: None - * - * Sets the system regs of the VCPU specified by vcpuid to the values - * given by sregs. - */ -void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) { - int ret = _vcpu_sregs_set(vm, vcpuid, sregs); - TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " - "rc: %i errno: %i", ret, errno); + struct kvm_create_device create_dev = { + .type = type, + .flags = KVM_CREATE_DEVICE_TEST, + }; + + return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); } -int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +int __kvm_create_device(struct kvm_vm *vm, uint64_t type) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_create_device create_dev = { + .type = type, + .fd = -1, + .flags = 0, + }; + int err; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); + TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value"); + return err ? : create_dev.fd; +} - return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val) +{ + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; + + return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr); } -void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val) { - int ret; + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr); } -void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +/* + * IRQ related functions. 
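+ *
+ * Illustrative sketch only: with an in-kernel irqchip created via
+ * vm_create_irqchip(), a test could pulse an interrupt line with
+ *
+ *	kvm_irq_line(vm, gsi, 1);
+ *	kvm_irq_line(vm, gsi, 0);
+ *
+ * where "gsi" is whichever line the test cares about.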
+ */ + +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) { - int ret; + struct kvm_irq_level irq_level = { + .irq = irq, + .level = level, + }; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level); } -void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) { - int ret; + int ret = _kvm_irq_line(vm, irq, level); - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); } -void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +struct kvm_irq_routing *kvm_gsi_routing_create(void) { - int ret; + struct kvm_irq_routing *routing; + size_t size; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + size = sizeof(struct kvm_irq_routing); + /* Allocate space for the max number of entries: this wastes 196 KBs. */ + size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry); + routing = calloc(1, size); + assert(routing); + + return routing; } -/* - * VCPU Ioctl - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VCPU fd. - */ -void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin) { - int ret; + int i; - ret = _vcpu_ioctl(vm, vcpuid, cmd, arg); - TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); + assert(routing); + assert(routing->nr < KVM_MAX_IRQ_ROUTES); + + i = routing->nr; + routing->entries[i].gsi = gsi; + routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + routing->entries[i].flags = 0; + routing->entries[i].u.irqchip.irqchip = 0; + routing->entries[i].u.irqchip.pin = pin; + routing->nr++; } -int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, cmd, arg); + assert(routing); + ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing); + free(routing); return ret; } -/* - * VM Ioctl - * - * Input Args: - * vm - Virtual Machine - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VM fd. 
- */ -void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) { int ret; - ret = ioctl(vm->fd, cmd, arg); - TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); + ret = _kvm_gsi_routing_write(vm, routing); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret)); } /* @@ -1535,14 +1943,15 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -1550,6 +1959,10 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) region->host_mem); fprintf(stream, "%*sunused_phy_pages: ", indent + 2, ""); sparsebit_dump(stream, region->unused_phy_pages, 0); + if (region->protected_phy_pages) { + fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, ""); + sparsebit_dump(stream, region->protected_phy_pages, 0); + } } fprintf(stream, "%*sMapped Virtual Pages:\n", indent, ""); sparsebit_dump(stream, vm->vpages_mapped, indent + 2); @@ -1561,37 +1974,58 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump(stream, vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); + list_for_each_entry(vcpu, &vm->vcpus, list) - vcpu_dump(stream, vm, vcpu->id, indent + 2); + vcpu_dump(stream, vcpu, indent + 2); } +#define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x} + /* Known KVM exit reasons */ static struct exit_reason { unsigned int reason; const char *name; } exit_reasons_known[] = { - {KVM_EXIT_UNKNOWN, "UNKNOWN"}, - {KVM_EXIT_EXCEPTION, "EXCEPTION"}, - {KVM_EXIT_IO, "IO"}, - {KVM_EXIT_HYPERCALL, "HYPERCALL"}, - {KVM_EXIT_DEBUG, "DEBUG"}, - {KVM_EXIT_HLT, "HLT"}, - {KVM_EXIT_MMIO, "MMIO"}, - {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"}, - {KVM_EXIT_SHUTDOWN, "SHUTDOWN"}, - {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"}, - {KVM_EXIT_INTR, "INTR"}, - {KVM_EXIT_SET_TPR, "SET_TPR"}, - {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"}, - {KVM_EXIT_S390_SIEIC, "S390_SIEIC"}, - {KVM_EXIT_S390_RESET, "S390_RESET"}, - {KVM_EXIT_DCR, "DCR"}, - {KVM_EXIT_NMI, "NMI"}, - {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"}, - {KVM_EXIT_OSI, "OSI"}, - {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"}, + KVM_EXIT_STRING(UNKNOWN), + KVM_EXIT_STRING(EXCEPTION), + KVM_EXIT_STRING(IO), + KVM_EXIT_STRING(HYPERCALL), + KVM_EXIT_STRING(DEBUG), + KVM_EXIT_STRING(HLT), + KVM_EXIT_STRING(MMIO), + KVM_EXIT_STRING(IRQ_WINDOW_OPEN), + KVM_EXIT_STRING(SHUTDOWN), + KVM_EXIT_STRING(FAIL_ENTRY), + KVM_EXIT_STRING(INTR), + KVM_EXIT_STRING(SET_TPR), + KVM_EXIT_STRING(TPR_ACCESS), + KVM_EXIT_STRING(S390_SIEIC), + KVM_EXIT_STRING(S390_RESET), + KVM_EXIT_STRING(DCR), + KVM_EXIT_STRING(NMI), + KVM_EXIT_STRING(INTERNAL_ERROR), + KVM_EXIT_STRING(OSI), + KVM_EXIT_STRING(PAPR_HCALL), + KVM_EXIT_STRING(S390_UCONTROL), + KVM_EXIT_STRING(WATCHDOG), + KVM_EXIT_STRING(S390_TSCH), + KVM_EXIT_STRING(EPR), + KVM_EXIT_STRING(SYSTEM_EVENT), + KVM_EXIT_STRING(S390_STSI), + 
KVM_EXIT_STRING(IOAPIC_EOI), + KVM_EXIT_STRING(HYPERV), + KVM_EXIT_STRING(ARM_NISV), + KVM_EXIT_STRING(X86_RDMSR), + KVM_EXIT_STRING(X86_WRMSR), + KVM_EXIT_STRING(DIRTY_RING_FULL), + KVM_EXIT_STRING(AP_RESET_HOLD), + KVM_EXIT_STRING(X86_BUS_LOCK), + KVM_EXIT_STRING(XEN), + KVM_EXIT_STRING(RISCV_SBI), + KVM_EXIT_STRING(RISCV_CSR), + KVM_EXIT_STRING(NOTIFY), #ifdef KVM_EXIT_MEMORY_NOT_PRESENT - {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"}, + KVM_EXIT_STRING(MEMORY_NOT_PRESENT), #endif }; @@ -1630,6 +2064,7 @@ const char *exit_reason_str(unsigned int exit_reason) * num - number of pages * paddr_min - Physical address minimum * memslot - Memory region to allocate page from + * protected - True if the pages will be used as protected/private memory * * Output Args: None * @@ -1641,8 +2076,9 @@ const char *exit_reason_str(unsigned int exit_reason) * and their base address is returned. A TEST_ASSERT failure occurs if * not enough pages are available at or above paddr_min. */ -vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, - vm_paddr_t paddr_min, uint32_t memslot) +vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, + vm_paddr_t paddr_min, uint32_t memslot, + bool protected) { struct userspace_mem_region *region; sparsebit_idx_t pg, base; @@ -1655,8 +2091,10 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, paddr_min, vm->page_size); region = memslot2region(vm, memslot); - base = pg = paddr_min >> vm->page_shift; + TEST_ASSERT(!protected || region->protected_phy_pages, + "Region doesn't support protected memory"); + base = pg = paddr_min >> vm->page_shift; do { for (; pg < base + num; ++pg) { if (!sparsebit_is_set(region->unused_phy_pages, pg)) { @@ -1675,8 +2113,11 @@ vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num, abort(); } - for (pg = base; pg < base + num; ++pg) + for (pg = base; pg < base + num; ++pg) { sparsebit_clear(region->unused_phy_pages, pg); + if (protected) + sparsebit_set(region->protected_phy_pages, pg); + } return base * vm->page_size; } @@ -1687,6 +2128,12 @@ vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); } +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) +{ + return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); +} + /* * Address Guest Virtual to Host Virtual * @@ -1704,60 +2151,9 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } -/* - * Is Unrestricted Guest - * - * Input Args: - * vm - Virtual Machine - * - * Output Args: None - * - * Return: True if the unrestricted guest is set to 'Y', otherwise return false. - * - * Check if the unrestricted guest flag is enabled. - */ -bool vm_is_unrestricted_guest(struct kvm_vm *vm) -{ - char val = 'N'; - size_t count; - FILE *f; - - if (vm == NULL) { - /* Ensure that the KVM vendor-specific module is loaded. 
*/ - f = fopen(KVM_DEV_PATH, "r"); - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", - errno); - fclose(f); - } - - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } - - return val == 'Y'; -} - -unsigned int vm_get_page_size(struct kvm_vm *vm) +unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm) { - return vm->page_size; -} - -unsigned int vm_get_page_shift(struct kvm_vm *vm) -{ - return vm->page_shift; -} - -unsigned int vm_get_max_gfn(struct kvm_vm *vm) -{ - return vm->max_gfn; -} - -int vm_get_fd(struct kvm_vm *vm) -{ - return vm->fd; + return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; } static unsigned int vm_calc_num_pages(unsigned int num_pages, @@ -1799,3 +2195,146 @@ unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); return vm_adjust_num_guest_pages(mode, n); } + +/* + * Read binary stats descriptors + * + * Input Args: + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD + * + * Output Args: None + * + * Return: + * A pointer to a newly allocated series of stat descriptors. + * Caller is responsible for freeing the returned kvm_stats_desc. + * + * Read the stats descriptors from the binary stats interface. + */ +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header) +{ + struct kvm_stats_desc *stats_desc; + ssize_t desc_size, total_size, ret; + + desc_size = get_stats_descriptor_size(header); + total_size = header->num_desc * desc_size; + + stats_desc = calloc(header->num_desc, desc_size); + TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); + + ret = pread(stats_fd, stats_desc, total_size, header->desc_offset); + TEST_ASSERT(ret == total_size, "Read KVM stats descriptors"); + + return stats_desc; +} + +/* + * Read stat data for a particular stat + * + * Input Args: + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD + * desc - the binary stat metadata for the particular stat to be read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data should be read + * + * Read the data values of a specified stat from the binary stats interface. 
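+ *
+ * Illustrative sketch only: once a stat's descriptor has been located
+ * (e.g. via read_stats_descriptors() above), reading its first value
+ * could look like
+ *
+ *	uint64_t val;
+ *
+ *	read_stat_data(stats_fd, &header, desc, &val, 1);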
+ */ +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements) +{ + size_t nr_elements = min_t(ssize_t, desc->size, max_elements); + size_t size = nr_elements * sizeof(*data); + ssize_t ret; + + TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name); + TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name); + + ret = pread(stats_fd, data, size, + header->data_offset + desc->offset); + + TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)", + desc->name, errno, strerror(errno)); + TEST_ASSERT(ret == size, + "pread() on stat '%s' read %ld bytes, wanted %lu bytes", + desc->name, size, ret); +} + +/* + * Read the data of the named stat + * + * Input Args: + * vm - the VM for which the stat should be read + * stat_name - the name of the stat to read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data should be read + * + * Read the data values of a specified stat from the binary stats interface. + */ +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements) +{ + struct kvm_stats_desc *desc; + size_t size_desc; + int i; + + if (!vm->stats_fd) { + vm->stats_fd = vm_get_stats_fd(vm); + read_stats_header(vm->stats_fd, &vm->stats_header); + vm->stats_desc = read_stats_descriptors(vm->stats_fd, + &vm->stats_header); + } + + size_desc = get_stats_descriptor_size(&vm->stats_header); + + for (i = 0; i < vm->stats_header.num_desc; ++i) { + desc = (void *)vm->stats_desc + (i * size_desc); + + if (strcmp(desc->name, stat_name)) + continue; + + read_stat_data(vm->stats_fd, &vm->stats_header, desc, + data, max_elements); + + break; + } +} + +__weak void kvm_arch_vm_post_create(struct kvm_vm *vm) +{ +} + +__weak void kvm_selftest_arch_init(void) +{ +} + +void __attribute((constructor)) kvm_selftest_init(void) +{ + /* Tell stdout not to buffer its content. */ + setbuf(stdout, NULL); + + guest_random_seed = random(); + + kvm_selftest_arch_init(); +} + +bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr) +{ + sparsebit_idx_t pg = 0; + struct userspace_mem_region *region; + + if (!vm_arch_has_protected_memory(vm)) + return false; + + region = userspace_mem_region_find(vm, paddr, paddr); + TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr); + + pg = paddr >> vm->page_shift; + return sparsebit_is_set(region->protected_phy_pages, pg); +} diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h deleted file mode 100644 index 2ef446520748..000000000000 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ /dev/null @@ -1,111 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/lib/kvm_util_internal.h - * - * Copyright (C) 2018, Google LLC. 
- */ - -#ifndef SELFTEST_KVM_UTIL_INTERNAL_H -#define SELFTEST_KVM_UTIL_INTERNAL_H - -#include "sparsebit.h" - -#define KVM_DEV_PATH "/dev/kvm" - -struct userspace_mem_region { - struct kvm_userspace_memory_region region; - struct sparsebit *unused_phy_pages; - int fd; - off_t offset; - void *host_mem; - void *mmap_start; - size_t mmap_size; - struct list_head list; -}; - -struct vcpu { - struct list_head list; - uint32_t id; - int fd; - struct kvm_run *state; -}; - -struct kvm_vm { - int mode; - unsigned long type; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct list_head vcpus; - struct list_head userspace_mem_regions; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; -}; - -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); - -/* - * Virtual Translation Tables Dump - * - * Input Args: - * stream - Output FILE stream - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps to the FILE stream given by @stream, the contents of all the - * virtual translation tables for the VM given by @vm. - */ -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); - -/* - * Register Dump - * - * Input Args: - * stream - Output FILE stream - * regs - Registers - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the registers given by @regs, to the FILE stream - * given by @stream. - */ -void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); - -/* - * System Register Dump - * - * Input Args: - * stream - Output FILE stream - * sregs - System registers - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the system registers given by @sregs, to the FILE stream - * given by @stream. - */ -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c new file mode 100644 index 000000000000..313277486a1d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + */ +#include <inttypes.h> +#include <linux/bitmap.h> + +#include "kvm_util.h" +#include "memstress.h" +#include "processor.h" +#include "ucall_common.h" + +struct memstress_args memstress_args; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +struct vcpu_thread { + /* The index of the vCPU. */ + int vcpu_idx; + + /* The pthread backing the vCPU. */ + pthread_t thread; + + /* Set to true once the vCPU thread is up and running. */ + bool running; +}; + +/* The vCPU threads involved in this test. */ +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; + +/* The function run by each vCPU thread, as provided by the test. */ +static void (*vcpu_thread_fn)(struct memstress_vcpu_args *); + +/* Set to true once all vCPU threads are up and running. 
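+ * Set by memstress_start_vcpu_threads() only after every per-vCPU
+ * "running" flag has been observed; vcpu_thread_main() spins on it
+ * before invoking the test-provided vCPU function.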
*/ +static bool all_vcpu_threads_running; + +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +void memstress_guest_code(uint32_t vcpu_idx) +{ + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args = &args->vcpu_args[vcpu_idx]; + struct guest_random_state rand_state; + uint64_t gva; + uint64_t pages; + uint64_t addr; + uint64_t page; + int i; + + rand_state = new_guest_random_state(guest_random_seed + vcpu_idx); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx); + + while (true) { + for (i = 0; i < sizeof(memstress_args); i += args->guest_page_size) + (void) *((volatile char *)args + i); + + for (i = 0; i < pages; i++) { + if (args->random_access) + page = guest_random_u32(&rand_state) % pages; + else + page = i; + + addr = gva + (page * args->guest_page_size); + + if (__guest_random_bool(&rand_state, args->write_percent)) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +void memstress_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, + struct kvm_vcpu *vcpus[], + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access) +{ + struct memstress_args *args = &memstress_args; + struct memstress_vcpu_args *vcpu_args; + int i; + + for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &args->vcpu_args[i]; + + vcpu_args->vcpu = vcpus[i]; + vcpu_args->vcpu_idx = i; + + if (partition_vcpu_memory_access) { + vcpu_args->gva = guest_test_virt_mem + + (i * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + args->guest_page_size; + vcpu_args->gpa = args->gpa + (i * vcpu_memory_bytes); + } else { + vcpu_args->gva = guest_test_virt_mem; + vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) / + args->guest_page_size; + vcpu_args->gpa = args->gpa; + } + + vcpu_args_set(vcpus[i], 1, i); + + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + i, vcpu_args->gpa, vcpu_args->gpa + + (vcpu_args->pages * args->guest_page_size)); + } +} + +struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, + uint64_t vcpu_memory_bytes, int slots, + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access) +{ + struct memstress_args *args = &memstress_args; + struct kvm_vm *vm; + uint64_t guest_num_pages, slot0_pages = 0; + uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); + uint64_t region_end_gfn; + int i; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + /* By default vCPUs will write to memory. */ + args->write_percent = 100; + + /* + * Snapshot the non-huge page size. This is used by the guest code to + * access/dirty pages at the logging granularity. + */ + args->guest_page_size = vm_guest_mode_params[mode].page_size; + + guest_num_pages = vm_adjust_num_guest_pages(mode, + (nr_vcpus * vcpu_memory_bytes) / args->guest_page_size); + + TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, + "Guest memory size is not host page size aligned."); + TEST_ASSERT(vcpu_memory_bytes % args->guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); + + /* + * If using nested, allocate extra pages for the nested page tables and + * in-memory data structures. 
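+ * memstress_nested_pages() below is a __weak hook that returns 0, so
+ * this only adds pages when an architecture overrides it with a real
+ * estimate.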
+ */ + if (args->nested) + slot0_pages += memstress_nested_pages(nr_vcpus); + + /* + * Pass guest_num_pages to populate the page tables for test memory. + * The memory is also added to memslot 0, but that's a benign side + * effect as KVM allows aliasing HVAs in meslots. + */ + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, + slot0_pages + guest_num_pages, + memstress_guest_code, vcpus); + + args->vm = vm; + + /* Put the test region at the top guest physical memory. */ + region_end_gfn = vm->max_gfn + 1; + +#ifdef __x86_64__ + /* + * When running vCPUs in L2, restrict the test region to 48 bits to + * avoid needing 5-level page tables to identity map L2. + */ + if (args->nested) + region_end_gfn = min(region_end_gfn, (1UL << 48) / args->guest_page_size); +#endif + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < region_end_gfn, + "Requested more guest memory than address space allows.\n" + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " nr_vcpus: %d wss: %" PRIx64 "]", + guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); + + args->gpa = (region_end_gfn - guest_num_pages - 1) * args->guest_page_size; + args->gpa = align_down(args->gpa, backing_src_pagesz); +#ifdef __s390x__ + /* Align to 1M (segment size) */ + args->gpa = align_down(args->gpa, 1 << 20); +#endif + args->size = guest_num_pages * args->guest_page_size; + pr_info("guest physical test memory: [0x%lx, 0x%lx)\n", + args->gpa, args->gpa + args->size); + + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = args->gpa + region_pages * args->guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + MEMSTRESS_MEM_SLOT_INDEX + i, + region_pages, 0); + } + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, args->gpa, guest_num_pages); + + memstress_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, + partition_vcpu_memory_access); + + if (args->nested) { + pr_info("Configuring vCPUs to run in L2 (nested).\n"); + memstress_setup_nested(vm, nr_vcpus, vcpus); + } + + /* Export the shared variables to the guest. */ + sync_global_to_guest(vm, memstress_args); + + return vm; +} + +void memstress_destroy_vm(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} + +void memstress_set_write_percent(struct kvm_vm *vm, uint32_t write_percent) +{ + memstress_args.write_percent = write_percent; + sync_global_to_guest(vm, memstress_args.write_percent); +} + +void memstress_set_random_access(struct kvm_vm *vm, bool random_access) +{ + memstress_args.random_access = random_access; + sync_global_to_guest(vm, memstress_args.random_access); +} + +uint64_t __weak memstress_nested_pages(int nr_vcpus) +{ + return 0; +} + +void __weak memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) +{ + pr_info("%s() not support on this architecture, skipping.\n", __func__); + exit(KSFT_SKIP); +} + +static void *vcpu_thread_main(void *data) +{ + struct vcpu_thread *vcpu = data; + int vcpu_idx = vcpu->vcpu_idx; + + if (memstress_args.pin_vcpus) + kvm_pin_this_task_to_pcpu(memstress_args.vcpu_to_pcpu[vcpu_idx]); + + WRITE_ONCE(vcpu->running, true); + + /* + * Wait for all vCPU threads to be up and running before calling the test- + * provided vCPU thread function. 
This prevents thread creation (which + * requires taking the mmap_sem in write mode) from interfering with the + * guest faulting in its memory. + */ + while (!READ_ONCE(all_vcpu_threads_running)) + ; + + vcpu_thread_fn(&memstress_args.vcpu_args[vcpu_idx]); + + return NULL; +} + +void memstress_start_vcpu_threads(int nr_vcpus, + void (*vcpu_fn)(struct memstress_vcpu_args *)) +{ + int i; + + vcpu_thread_fn = vcpu_fn; + WRITE_ONCE(all_vcpu_threads_running, false); + WRITE_ONCE(memstress_args.stop_vcpus, false); + + for (i = 0; i < nr_vcpus; i++) { + struct vcpu_thread *vcpu = &vcpu_threads[i]; + + vcpu->vcpu_idx = i; + WRITE_ONCE(vcpu->running, false); + + pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); + } + + for (i = 0; i < nr_vcpus; i++) { + while (!READ_ONCE(vcpu_threads[i].running)) + ; + } + + WRITE_ONCE(all_vcpu_threads_running, true); +} + +void memstress_join_vcpu_threads(int nr_vcpus) +{ + int i; + + WRITE_ONCE(memstress_args.stop_vcpus, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i].thread, NULL); +} + +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); + } +} + +void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); + } +} + +unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot) +{ + unsigned long **bitmaps; + int i; + + bitmaps = malloc(slots * sizeof(bitmaps[0])); + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); + + for (i = 0; i < slots; i++) { + bitmaps[i] = bitmap_zalloc(pages_per_slot); + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); + } + + return bitmaps; +} + +void memstress_free_bitmaps(unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) + free(bitmaps[i]); + + free(bitmaps); +} diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c" diff --git a/tools/testing/selftests/kvm/lib/riscv/handlers.S b/tools/testing/selftests/kvm/lib/riscv/handlers.S new file mode 100644 index 000000000000..aa0abd3f35bb --- /dev/null +++ b/tools/testing/selftests/kvm/lib/riscv/handlers.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Intel Corporation + */ + +#ifndef __ASSEMBLY__ +#define __ASSEMBLY__ +#endif + +#include <asm/csr.h> + +.macro save_context + addi sp, sp, (-8*34) + sd x1, 0(sp) + sd x2, 8(sp) + sd x3, 16(sp) + sd x4, 24(sp) + sd x5, 32(sp) + sd x6, 40(sp) + sd x7, 48(sp) + sd x8, 56(sp) + sd x9, 64(sp) + sd x10, 72(sp) + sd x11, 80(sp) + sd x12, 
88(sp) + sd x13, 96(sp) + sd x14, 104(sp) + sd x15, 112(sp) + sd x16, 120(sp) + sd x17, 128(sp) + sd x18, 136(sp) + sd x19, 144(sp) + sd x20, 152(sp) + sd x21, 160(sp) + sd x22, 168(sp) + sd x23, 176(sp) + sd x24, 184(sp) + sd x25, 192(sp) + sd x26, 200(sp) + sd x27, 208(sp) + sd x28, 216(sp) + sd x29, 224(sp) + sd x30, 232(sp) + sd x31, 240(sp) + csrr s0, CSR_SEPC + csrr s1, CSR_SSTATUS + csrr s2, CSR_SCAUSE + sd s0, 248(sp) + sd s1, 256(sp) + sd s2, 264(sp) +.endm + +.macro restore_context + ld s2, 264(sp) + ld s1, 256(sp) + ld s0, 248(sp) + csrw CSR_SCAUSE, s2 + csrw CSR_SSTATUS, s1 + csrw CSR_SEPC, s0 + ld x31, 240(sp) + ld x30, 232(sp) + ld x29, 224(sp) + ld x28, 216(sp) + ld x27, 208(sp) + ld x26, 200(sp) + ld x25, 192(sp) + ld x24, 184(sp) + ld x23, 176(sp) + ld x22, 168(sp) + ld x21, 160(sp) + ld x20, 152(sp) + ld x19, 144(sp) + ld x18, 136(sp) + ld x17, 128(sp) + ld x16, 120(sp) + ld x15, 112(sp) + ld x14, 104(sp) + ld x13, 96(sp) + ld x12, 88(sp) + ld x11, 80(sp) + ld x10, 72(sp) + ld x9, 64(sp) + ld x8, 56(sp) + ld x7, 48(sp) + ld x6, 40(sp) + ld x5, 32(sp) + ld x4, 24(sp) + ld x3, 16(sp) + ld x2, 8(sp) + ld x1, 0(sp) + addi sp, sp, (8*34) +.endm + +.balign 4 +.global exception_vectors +exception_vectors: + save_context + move a0, sp + call route_exception + restore_context + sret diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c new file mode 100644 index 000000000000..6ae47b3d6b25 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V code + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/compiler.h> +#include <assert.h> + +#include "kvm_util.h" +#include "processor.h" +#include "ucall_common.h" + +#define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 + +static vm_vaddr_t exception_handlers; + +bool __vcpu_has_ext(struct kvm_vcpu *vcpu, uint64_t ext) +{ + unsigned long value = 0; + int ret; + + ret = __vcpu_get_reg(vcpu, ext, &value); + + return !ret && !!value; +} + +static uint64_t page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size) & ~(vm->page_size - 1); +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) +{ + return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) << + PGTBL_PAGE_SIZE_SHIFT; +} + +static uint64_t ptrs_per_pte(struct kvm_vm *vm) +{ + return PGTBL_PAGE_SIZE / sizeof(uint64_t); +} + +static uint64_t pte_index_mask[] = { + PGTBL_L0_INDEX_MASK, + PGTBL_L1_INDEX_MASK, + PGTBL_L2_INDEX_MASK, + PGTBL_L3_INDEX_MASK, +}; + +static uint32_t pte_index_shift[] = { + PGTBL_L0_INDEX_SHIFT, + PGTBL_L1_INDEX_SHIFT, + PGTBL_L2_INDEX_SHIFT, + PGTBL_L3_INDEX_SHIFT, +}; + +static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) +{ + TEST_ASSERT(level > -1, + "Negative page table level (%d) not possible", level); + TEST_ASSERT(level < vm->pgtable_levels, + "Invalid page table level (%d)", level); + + return (gva & pte_index_mask[level]) >> pte_index_shift[level]; +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) +{ + size_t nr_pages = page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size; + + if (vm->pgd_created) + return; + + vm->pgd = vm_phy_pages_alloc(vm, nr_pages, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); + vm->pgd_created = true; +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + uint64_t *ptep, next_ppn; + int level = vm->pgtable_levels - 1; + + TEST_ASSERT((vaddr 
% vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8; + if (!*ptep) { + next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT; + *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_VALID_MASK; + } + level--; + + while (level > -1) { + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + + pte_index(vm, vaddr, level) * 8; + if (!*ptep && level > 0) { + next_ppn = vm_alloc_page_table(vm) >> + PGTBL_PAGE_SIZE_SHIFT; + *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_VALID_MASK; + } + level--; + } + + paddr = paddr >> PGTBL_PAGE_SIZE_SHIFT; + *ptep = (paddr << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_PERM_MASK | PGTBL_PTE_VALID_MASK; +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep; + int level = vm->pgtable_levels - 1; + + if (!vm->pgd_created) + goto unmapped_gva; + + ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8; + if (!ptep) + goto unmapped_gva; + level--; + + while (level > -1) { + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + + pte_index(vm, gva, level) * 8; + if (!ptep) + goto unmapped_gva; + level--; + } + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + +unmapped_gva: + TEST_FAIL("No mapping for vm virtual address gva: 0x%lx level: %d", + gva, level); + exit(1); +} + +static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t page, int level) +{ +#ifdef DEBUG + static const char *const type[] = { "pte", "pmd", "pud", "p4d"}; + uint64_t pte, *ptep; + + if (level < 0) + return; + + for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { + ptep = addr_gpa2hva(vm, pte); + if (!*ptep) + continue; + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", + type[level], pte, *ptep, ptep); + pte_dump(stream, vm, indent + 1, + pte_addr(vm, *ptep), level - 1); + } +#endif +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + int level = vm->pgtable_levels - 1; + uint64_t pgd, *ptep; + + if (!vm->pgd_created) + return; + + for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { + ptep = addr_gpa2hva(vm, pgd); + if (!*ptep) + continue; + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", + pgd, *ptep, ptep); + pte_dump(stream, vm, indent + 1, + pte_addr(vm, *ptep), level - 1); + } +} + +void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) +{ + struct kvm_vm *vm = vcpu->vm; + unsigned long satp; + + /* + * The RISC-V Sv48 MMU mode supports 56-bit physical address + * for 48-bit virtual address with 4KB last level page size. 
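+ * (Per the RISC-V privileged spec, Sv48 resolves a virtual address as
+ * four 9-bit page-table indices plus a 12-bit page offset.)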
+ */ + switch (vm->mode) { + case VM_MODE_P52V48_4K: + case VM_MODE_P48V48_4K: + case VM_MODE_P40V48_4K: + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; + satp |= SATP_MODE_48; + + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(satp), satp); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ + struct kvm_riscv_core core; + + vcpu_get_reg(vcpu, RISCV_CORE_REG(mode), &core.mode); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &core.regs.pc); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.ra), &core.regs.ra); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.sp), &core.regs.sp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.gp), &core.regs.gp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.tp), &core.regs.tp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t0), &core.regs.t0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t1), &core.regs.t1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t2), &core.regs.t2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s0), &core.regs.s0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s1), &core.regs.s1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a0), &core.regs.a0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a1), &core.regs.a1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a2), &core.regs.a2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a3), &core.regs.a3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a4), &core.regs.a4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a5), &core.regs.a5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a6), &core.regs.a6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a7), &core.regs.a7); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s2), &core.regs.s2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s3), &core.regs.s3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s4), &core.regs.s4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s5), &core.regs.s5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s6), &core.regs.s6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s7), &core.regs.s7); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s8), &core.regs.s8); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s9), &core.regs.s9); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s10), &core.regs.s10); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s11), &core.regs.s11); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t3), &core.regs.t3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t4), &core.regs.t4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t5), &core.regs.t5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t6), &core.regs.t6); + + fprintf(stream, + " MODE: 0x%lx\n", core.mode); + fprintf(stream, + " PC: 0x%016lx RA: 0x%016lx SP: 0x%016lx GP: 0x%016lx\n", + core.regs.pc, core.regs.ra, core.regs.sp, core.regs.gp); + fprintf(stream, + " TP: 0x%016lx T0: 0x%016lx T1: 0x%016lx T2: 0x%016lx\n", + core.regs.tp, core.regs.t0, core.regs.t1, core.regs.t2); + fprintf(stream, + " S0: 0x%016lx S1: 0x%016lx A0: 0x%016lx A1: 0x%016lx\n", + core.regs.s0, core.regs.s1, core.regs.a0, core.regs.a1); + fprintf(stream, + " A2: 0x%016lx A3: 0x%016lx A4: 0x%016lx A5: 0x%016lx\n", + core.regs.a2, core.regs.a3, core.regs.a4, core.regs.a5); + fprintf(stream, + " A6: 0x%016lx A7: 0x%016lx S2: 0x%016lx S3: 0x%016lx\n", + core.regs.a6, core.regs.a7, core.regs.s2, core.regs.s3); + fprintf(stream, + " S4: 0x%016lx S5: 0x%016lx S6: 0x%016lx S7: 0x%016lx\n", + core.regs.s4, core.regs.s5, core.regs.s6, core.regs.s7); + fprintf(stream, + " S8: 0x%016lx S9: 0x%016lx S10: 0x%016lx S11: 0x%016lx\n", + core.regs.s8, core.regs.s9, core.regs.s10, core.regs.s11); + fprintf(stream, + " T3: 0x%016lx T4: 0x%016lx T5: 
0x%016lx T6: 0x%016lx\n", + core.regs.t3, core.regs.t4, core.regs.t5, core.regs.t6); +} + +static void __aligned(16) guest_unexp_trap(void) +{ + sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, + KVM_RISCV_SELFTESTS_SBI_UNEXP, + 0, 0, 0, 0, 0, 0); +} + +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) +{ + int r; + size_t stack_size; + unsigned long stack_vaddr; + unsigned long current_gp = 0; + struct kvm_mp_state mps; + struct kvm_vcpu *vcpu; + + stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_RISCV_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); + + vcpu = __vm_vcpu_add(vm, vcpu_id); + riscv_vcpu_mmu_setup(vcpu); + + /* + * With SBI HSM support in KVM RISC-V, all secondary VCPUs are + * powered-off by default so we ensure that all secondary VCPUs + * are powered-on using KVM_SET_MP_STATE ioctl(). + */ + mps.mp_state = KVM_MP_STATE_RUNNABLE; + r = __vcpu_ioctl(vcpu, KVM_SET_MP_STATE, &mps); + TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r); + + /* Setup global pointer of guest to be same as the host */ + asm volatile ( + "add %0, gp, zero" : "=r" (current_gp) : : "memory"); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.gp), current_gp); + + /* Setup stack pointer and program counter of guest */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size); + + /* Setup sscratch for guest_get_vcpuid() */ + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(sscratch), vcpu_id); + + /* Setup default exception vector of guest */ + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)guest_unexp_trap); + + return vcpu; +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
+{ + va_list ap; + uint64_t id = RISCV_CORE_REG(regs.a0); + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + switch (i) { + case 0: + id = RISCV_CORE_REG(regs.a0); + break; + case 1: + id = RISCV_CORE_REG(regs.a1); + break; + case 2: + id = RISCV_CORE_REG(regs.a2); + break; + case 3: + id = RISCV_CORE_REG(regs.a3); + break; + case 4: + id = RISCV_CORE_REG(regs.a4); + break; + case 5: + id = RISCV_CORE_REG(regs.a5); + break; + case 6: + id = RISCV_CORE_REG(regs.a6); + break; + case 7: + id = RISCV_CORE_REG(regs.a7); + break; + } + vcpu_set_reg(vcpu, id, va_arg(ap, uint64_t)); + } + + va_end(ap); +} + +void kvm_exit_unexpected_exception(int vector, int ec) +{ + ucall(UCALL_UNHANDLED, 2, vector, ec); +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) { + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } +} + +struct handlers { + exception_handler_fn exception_handlers[NR_VECTORS][NR_EXCEPTIONS]; +}; + +void route_exception(struct ex_regs *regs) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + int vector = 0, ec; + + ec = regs->cause & ~CAUSE_IRQ_FLAG; + if (ec >= NR_EXCEPTIONS) + goto unexpected_exception; + + /* Use the same handler for all the interrupts */ + if (regs->cause & CAUSE_IRQ_FLAG) { + vector = 1; + ec = 0; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + return kvm_exit_unexpected_exception(vector, ec); +} + +void vcpu_init_vector_tables(struct kvm_vcpu *vcpu) +{ + extern char exception_vectors; + + vcpu_set_reg(vcpu, RISCV_GENERAL_CSR_REG(stvec), (unsigned long)&exception_vectors); +} + +void vm_init_vector_tables(struct kvm_vm *vm) +{ + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size, MEM_REGION_DATA); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, exception_handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(vector < NR_EXCEPTIONS); + handlers->exception_handlers[0][vector] = handler; +} + +void vm_install_interrupt_handler(struct kvm_vm *vm, exception_handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + handlers->exception_handlers[1][0] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return csr_read(CSR_SSCRATCH); +} + +struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5) +{ + register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0); + register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1); + register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2); + register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3); + register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4); + register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5); + register uintptr_t a6 asm ("a6") = (uintptr_t)(fid); + register uintptr_t a7 asm ("a7") = (uintptr_t)(ext); + struct sbiret ret; + + asm volatile ( + "ecall" + : "+r" (a0), "+r" (a1) + : "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7) + : "memory"); + ret.error = a0; + ret.value = a1; + + return ret; +} + +bool guest_sbi_probe_extension(int extid, long *out_val) +{ + struct 
sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_PROBE_EXT, extid, + 0, 0, 0, 0, 0); + + __GUEST_ASSERT(!ret.error || ret.error == SBI_ERR_NOT_SUPPORTED, + "ret.error=%ld, ret.value=%ld\n", ret.error, ret.value); + + if (ret.error == SBI_ERR_NOT_SUPPORTED) + return false; + + if (out_val) + *out_val = ret.value; + + return true; +} + +unsigned long get_host_sbi_spec_version(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_BASE, SBI_EXT_BASE_GET_SPEC_VERSION, 0, + 0, 0, 0, 0, 0); + + GUEST_ASSERT(!ret.error); + + return ret.value; +} diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c new file mode 100644 index 000000000000..14ee17151a59 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/kvm.h> + +#include "kvm_util.h" +#include "processor.h" + +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + if (run->exit_reason == KVM_EXIT_RISCV_SBI && + run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) { + switch (run->riscv_sbi.function_id) { + case KVM_RISCV_SELFTESTS_SBI_UCALL: + return (void *)run->riscv_sbi.args[0]; + case KVM_RISCV_SELFTESTS_SBI_UNEXP: + vcpu_dump(stderr, vcpu, 2); + TEST_ASSERT(0, "Unexpected trap taken by guest"); + break; + default: + break; + } + } + return NULL; +} diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c new file mode 100644 index 000000000000..2c432fa164f1 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test handler for the s390x DIAGNOSE 0x0318 instruction. + * + * Copyright (C) 2020, IBM + */ + +#include "test_util.h" +#include "kvm_util.h" + +#define ICPT_INSTRUCTION 0x04 +#define IPA0_DIAG 0x8300 + +static void guest_code(void) +{ + uint64_t diag318_info = 0x12345678; + + asm volatile ("diag %0,0,0x318\n" : : "d" (diag318_info)); +} + +/* + * The DIAGNOSE 0x0318 instruction call must be handled via userspace. As such, + * we create an ad-hoc VM here to handle the instruction then extract the + * necessary data. It is up to the caller to decide what to do with that data. + */ +static uint64_t diag318_handler(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_run *run; + uint64_t reg; + uint64_t diag318_info; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_run(vcpu); + run = vcpu->run; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT(run->s390_sieic.icptcode == ICPT_INSTRUCTION, + "Unexpected intercept code: 0x%x", run->s390_sieic.icptcode); + TEST_ASSERT((run->s390_sieic.ipa & 0xff00) == IPA0_DIAG, + "Unexpected IPA0 code: 0x%x", (run->s390_sieic.ipa & 0xff00)); + + reg = (run->s390_sieic.ipa & 0x00f0) >> 4; + diag318_info = run->s.regs.gprs[reg]; + + TEST_ASSERT(diag318_info != 0, "DIAGNOSE 0x0318 info not set"); + + kvm_vm_free(vm); + + return diag318_info; +} + +uint64_t get_diag318_info(void) +{ + static uint64_t diag318_info; + static bool printed_skip; + + /* + * If KVM does not support diag318, then return 0 to + * ensure tests do not break. + */ + if (!kvm_has_cap(KVM_CAP_S390_DIAG318)) { + if (!printed_skip) { + fprintf(stdout, "KVM_CAP_S390_DIAG318 not supported. 
" + "Skipping diag318 test.\n"); + printed_skip = true; + } + return 0; + } + + /* + * If a test has previously requested the diag318 info, + * then don't bother spinning up a temporary VM again. + */ + if (!diag318_info) + diag318_info = diag318_handler(); + + return diag318_info; +} diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index a88c5d665725..4ad4492eea1d 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -5,17 +5,12 @@ * Copyright (C) 2019, Red Hat, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ - #include "processor.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" - -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define PAGES_PER_REGION 4 -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; @@ -26,7 +21,8 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, + vm->memslots[MEM_REGION_PT]); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); vm->pgd = paddr; @@ -38,12 +34,12 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) * a page table (ri == 4). Returns a suitable region/segment table entry * which points to the freshly allocated pages. */ -static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) { uint64_t taddr; taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); return (taddr & REGION_ENTRY_ORIGIN) @@ -51,8 +47,7 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) | ((ri < 4 ? 
(PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, - uint32_t memslot) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) { int ri, idx; uint64_t *entry; @@ -79,7 +74,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) - entry[idx] = virt_alloc_region(vm, ri, memslot); + entry[idx] = virt_alloc_region(vm, ri); entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } @@ -91,7 +86,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, entry[idx] = gpa; } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int ri, idx; uint64_t *entry; @@ -152,7 +147,7 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, } } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { if (!vm->pgd_created) return; @@ -160,84 +155,69 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump_region(stream, vm, indent, vm->pgd); } -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) { - /* - * The additional amount of pages required for the page tables is: - * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ... - * which is definitely smaller than (n / 256) * 2. - */ - uint64_t extra_pg_pages = extra_mem_pages / 256 * 2; - struct kvm_vm *vm; - - vm = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); - vm_vcpu_add_default(vm, vcpuid, guest_code); - - return vm; + vcpu->run->psw_addr = (uintptr_t)guest_code; } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); uint64_t stack_vaddr; struct kvm_regs regs; struct kvm_sregs sregs; - struct kvm_run *run; + struct kvm_vcpu *vcpu; TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", vm->page_size); - stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + stack_vaddr = __vm_vaddr_alloc(vm, stack_size, + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); - vm_vcpu_add(vm, vcpuid); + vcpu = __vm_vcpu_add(vm, vcpu_id); /* Setup guest registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vcpu, &sregs); + + vcpu->run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ - run = vcpu_state(vm, vcpuid); - run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ - run->psw_addr = (uintptr_t)guest_code; + return vcpu; } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
{ va_list ap; struct kvm_regs regs; int i; TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" - " num: %u\n", + " num: %u", num); va_start(ap, num); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); for (i = 0; i < num; i++) regs.gprs[i + 2] = va_arg(ap, uint64_t); - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); va_end(ap); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - if (!vcpu) - return; - fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", - indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); + indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ } diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index fd589dc9bfab..cca98734653d 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -6,37 +6,9 @@ */ #include "kvm_util.h" -void ucall_init(struct kvm_vm *vm, void *arg) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { -} - -void ucall_uninit(struct kvm_vm *vm) -{ -} - -void ucall(uint64_t cmd, int nargs, ...) -{ - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - - /* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */ - asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); -} - -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) -{ - struct kvm_run *run = vcpu_state(vm, vcpu_id); - struct ucall ucall = {}; + struct kvm_run *run = vcpu->run; if (run->exit_reason == KVM_EXIT_S390_SIEIC && run->s390_sieic.icptcode == 4 && @@ -44,13 +16,7 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) (run->s390_sieic.ipb >> 16) == 0x501) { int reg = run->s390_sieic.ipa & 0xf; - memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]), - sizeof(ucall)); - - vcpu_run_complete_io(vm, vcpu_id); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + return (void *)run->s.regs.gprs[reg]; } - - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c index 031ba3c932ed..cfed9d26cc71 100644 --- a/tools/testing/selftests/kvm/lib/sparsebit.c +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -202,7 +202,7 @@ static sparsebit_num_t node_num_set(struct node *nodep) /* Returns a pointer to the node that describes the * lowest bit index. */ -static struct node *node_first(struct sparsebit *s) +static struct node *node_first(const struct sparsebit *s) { struct node *nodep; @@ -216,7 +216,7 @@ static struct node *node_first(struct sparsebit *s) * lowest bit index > the index of the node pointed to by np. * Returns NULL if no node with a higher index exists. */ -static struct node *node_next(struct sparsebit *s, struct node *np) +static struct node *node_next(const struct sparsebit *s, struct node *np) { struct node *nodep = np; @@ -244,7 +244,7 @@ static struct node *node_next(struct sparsebit *s, struct node *np) * highest index < the index of the node pointed to by np. * Returns NULL if no node with a lower index exists. 
*/ -static struct node *node_prev(struct sparsebit *s, struct node *np) +static struct node *node_prev(const struct sparsebit *s, struct node *np) { struct node *nodep = np; @@ -273,7 +273,7 @@ static struct node *node_prev(struct sparsebit *s, struct node *np) * subtree and duplicates the bit settings to the newly allocated nodes. * Returns the newly allocated copy of subtree. */ -static struct node *node_copy_subtree(struct node *subtree) +static struct node *node_copy_subtree(const struct node *subtree) { struct node *root; @@ -307,7 +307,7 @@ static struct node *node_copy_subtree(struct node *subtree) * index is within the bits described by the mask bits or the number of * contiguous bits set after the mask. Returns NULL if there is no such node. */ -static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx) +static struct node *node_find(const struct sparsebit *s, sparsebit_idx_t idx) { struct node *nodep; @@ -393,7 +393,7 @@ static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx) } /* Returns whether all the bits in the sparsebit array are set. */ -bool sparsebit_all_set(struct sparsebit *s) +bool sparsebit_all_set(const struct sparsebit *s) { /* * If any nodes there must be at least one bit set. Only case @@ -634,7 +634,6 @@ static void node_reduce(struct sparsebit *s, struct node *nodep) tmp = node_prev(s, nodep); node_rm(s, nodep); - nodep = NULL; nodep = tmp; reduction_performed = true; @@ -776,7 +775,7 @@ static void node_reduce(struct sparsebit *s, struct node *nodep) /* Returns whether the bit at the index given by idx, within the * sparsebit array is set or not. */ -bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx) +bool sparsebit_is_set(const struct sparsebit *s, sparsebit_idx_t idx) { struct node *nodep; @@ -922,7 +921,7 @@ static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start) * used by test cases after they detect an unexpected condition, as a means * to capture diagnostic information. */ -static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s, +static void sparsebit_dump_internal(FILE *stream, const struct sparsebit *s, unsigned int indent) { /* Dump the contents of s */ @@ -970,7 +969,7 @@ void sparsebit_free(struct sparsebit **sbitp) * sparsebit_alloc(). It can though already have bits set, which * if different from src will be cleared. */ -void sparsebit_copy(struct sparsebit *d, struct sparsebit *s) +void sparsebit_copy(struct sparsebit *d, const struct sparsebit *s) { /* First clear any bits already set in the destination */ sparsebit_clear_all(d); @@ -982,7 +981,7 @@ void sparsebit_copy(struct sparsebit *d, struct sparsebit *s) } /* Returns whether num consecutive bits starting at idx are all set. */ -bool sparsebit_is_set_num(struct sparsebit *s, +bool sparsebit_is_set_num(const struct sparsebit *s, sparsebit_idx_t idx, sparsebit_num_t num) { sparsebit_idx_t next_cleared; @@ -1006,14 +1005,14 @@ bool sparsebit_is_set_num(struct sparsebit *s, } /* Returns whether the bit at the index given by idx. */ -bool sparsebit_is_clear(struct sparsebit *s, +bool sparsebit_is_clear(const struct sparsebit *s, sparsebit_idx_t idx) { return !sparsebit_is_set(s, idx); } /* Returns whether num consecutive bits starting at idx are all cleared. */ -bool sparsebit_is_clear_num(struct sparsebit *s, +bool sparsebit_is_clear_num(const struct sparsebit *s, sparsebit_idx_t idx, sparsebit_num_t num) { sparsebit_idx_t next_set; @@ -1042,13 +1041,13 @@ bool sparsebit_is_clear_num(struct sparsebit *s, * value. 
Use sparsebit_any_set(), instead of sparsebit_num_set() > 0, * to determine if the sparsebit array has any bits set. */ -sparsebit_num_t sparsebit_num_set(struct sparsebit *s) +sparsebit_num_t sparsebit_num_set(const struct sparsebit *s) { return s->num_set; } /* Returns whether any bit is set in the sparsebit array. */ -bool sparsebit_any_set(struct sparsebit *s) +bool sparsebit_any_set(const struct sparsebit *s) { /* * Nodes only describe set bits. If any nodes then there @@ -1071,20 +1070,20 @@ bool sparsebit_any_set(struct sparsebit *s) } /* Returns whether all the bits in the sparsebit array are cleared. */ -bool sparsebit_all_clear(struct sparsebit *s) +bool sparsebit_all_clear(const struct sparsebit *s) { return !sparsebit_any_set(s); } /* Returns whether all the bits in the sparsebit array are set. */ -bool sparsebit_any_clear(struct sparsebit *s) +bool sparsebit_any_clear(const struct sparsebit *s) { return !sparsebit_all_set(s); } /* Returns the index of the first set bit. Abort if no bits are set. */ -sparsebit_idx_t sparsebit_first_set(struct sparsebit *s) +sparsebit_idx_t sparsebit_first_set(const struct sparsebit *s) { struct node *nodep; @@ -1098,7 +1097,7 @@ sparsebit_idx_t sparsebit_first_set(struct sparsebit *s) /* Returns the index of the first cleared bit. Abort if * no bits are cleared. */ -sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s) +sparsebit_idx_t sparsebit_first_clear(const struct sparsebit *s) { struct node *nodep1, *nodep2; @@ -1152,7 +1151,7 @@ sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s) /* Returns index of next bit set within s after the index given by prev. * Returns 0 if there are no bits after prev that are set. */ -sparsebit_idx_t sparsebit_next_set(struct sparsebit *s, +sparsebit_idx_t sparsebit_next_set(const struct sparsebit *s, sparsebit_idx_t prev) { sparsebit_idx_t lowest_possible = prev + 1; @@ -1245,7 +1244,7 @@ sparsebit_idx_t sparsebit_next_set(struct sparsebit *s, /* Returns index of next bit cleared within s after the index given by prev. * Returns 0 if there are no bits after prev that are cleared. */ -sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s, +sparsebit_idx_t sparsebit_next_clear(const struct sparsebit *s, sparsebit_idx_t prev) { sparsebit_idx_t lowest_possible = prev + 1; @@ -1301,7 +1300,7 @@ sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s, * and returns the index of the first sequence of num consecutively set * bits. Returns a value of 0 of no such sequence exists. */ -sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s, +sparsebit_idx_t sparsebit_next_set_num(const struct sparsebit *s, sparsebit_idx_t start, sparsebit_num_t num) { sparsebit_idx_t idx; @@ -1336,7 +1335,7 @@ sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s, * and returns the index of the first sequence of num consecutively cleared * bits. Returns a value of 0 of no such sequence exists. */ -sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s, +sparsebit_idx_t sparsebit_next_clear_num(const struct sparsebit *s, sparsebit_idx_t start, sparsebit_num_t num) { sparsebit_idx_t idx; @@ -1584,7 +1583,7 @@ static size_t display_range(FILE *stream, sparsebit_idx_t low, * contiguous bits. This is done because '-' is used to specify command-line * options, and sometimes ranges are specified as command-line arguments. 
*/ -void sparsebit_dump(FILE *stream, struct sparsebit *s, +void sparsebit_dump(FILE *stream, const struct sparsebit *s, unsigned int indent) { size_t current_line_len = 0; @@ -1682,7 +1681,7 @@ void sparsebit_dump(FILE *stream, struct sparsebit *s, * s. On error, diagnostic information is printed to stderr and * abort is called. */ -void sparsebit_validate_internal(struct sparsebit *s) +void sparsebit_validate_internal(const struct sparsebit *s) { bool error_detected = false; struct node *nodep, *prev = NULL; @@ -1866,7 +1865,7 @@ void sparsebit_validate_internal(struct sparsebit *s) * of total bits set. */ if (s->num_set != total_bits_set) { - fprintf(stderr, "Number of bits set missmatch,\n" + fprintf(stderr, "Number of bits set mismatch,\n" " s->num_set: 0x%lx total_bits_set: 0x%lx", s->num_set, total_bits_set); @@ -1890,7 +1889,6 @@ void sparsebit_validate_internal(struct sparsebit *s) */ #include <stdlib.h> -#include <assert.h> struct range { sparsebit_idx_t first, last; diff --git a/tools/testing/selftests/kvm/lib/string_override.c b/tools/testing/selftests/kvm/lib/string_override.c new file mode 100644 index 000000000000..5d1c87277c49 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/string_override.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> + +/* + * Override the "basic" built-in string helpers so that they can be used in + * guest code. KVM selftests don't support dynamic loading in guest code and + * will jump into the weeds if the compiler decides to insert an out-of-line + * call via the PLT. + */ +int memcmp(const void *cs, const void *ct, size_t count) +{ + const unsigned char *su1, *su2; + int res = 0; + + for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) { + if ((res = *su1 - *su2) != 0) + break; + } + return res; +} + +void *memcpy(void *dest, const void *src, size_t count) +{ + char *tmp = dest; + const char *s = src; + + while (count--) + *tmp++ = *s++; + return dest; +} + +void *memset(void *s, int c, size_t count) +{ + char *xs = s; + + while (count--) + *xs++ = c; + return s; +} + +size_t strnlen(const char *s, size_t count) +{ + const char *sc; + + for (sc = s; count-- && *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 689e97c27ee2..8ed0b74ae837 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -4,13 +4,38 @@ * * Copyright (C) 2020, Google LLC. */ -#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <assert.h> #include <ctype.h> #include <limits.h> -#include <assert.h> +#include <stdlib.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <linux/mman.h> +#include "linux/kernel.h" + #include "test_util.h" /* + * Random number generator that is usable from guest code. This is the + * Park-Miller LCG using standard constants. + */ + +struct guest_random_state new_guest_random_state(uint32_t seed) +{ + struct guest_random_state s = {.seed = seed}; + return s; +} + +uint32_t guest_random_u32(struct guest_random_state *state) +{ + state->seed = (uint64_t)state->seed * 48271 % ((uint32_t)(1 << 31) - 1); + return state->seed; +} + +/* * Parses "[0-9]+[kmgt]?". 
*/ size_t parse_size(const char *size) @@ -81,6 +106,21 @@ struct timespec timespec_sub(struct timespec ts1, struct timespec ts2) return timespec_add_ns((struct timespec){0}, ns1 - ns2); } +struct timespec timespec_elapsed(struct timespec start) +{ + struct timespec end; + + clock_gettime(CLOCK_MONOTONIC, &end); + return timespec_sub(end, start); +} + +struct timespec timespec_div(struct timespec ts, int divisor) +{ + int64_t ns = timespec_to_ns(ts) / divisor; + + return timespec_add_ns((struct timespec){0}, ns); +} + void print_skip(const char *fmt, ...) { va_list ap; @@ -91,3 +131,287 @@ void print_skip(const char *fmt, ...) va_end(ap); puts(", skipping test"); } + +bool thp_configured(void) +{ + int ret; + struct stat statbuf; + + ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); + TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), + "Error in stating /sys/kernel/mm/transparent_hugepage"); + + return ret == 0; +} + +size_t get_trans_hugepagesz(void) +{ + size_t size; + FILE *f; + int ret; + + TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); + + f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r"); + TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size"); + + ret = fscanf(f, "%ld", &size); + ret = fscanf(f, "%ld", &size); + TEST_ASSERT(ret < 1, "Error reading transparent_hugepage/hpage_pmd_size"); + fclose(f); + + return size; +} + +size_t get_def_hugetlb_pagesz(void) +{ + char buf[64]; + const char *hugepagesize = "Hugepagesize:"; + const char *hugepages_total = "HugePages_Total:"; + FILE *f; + + f = fopen("/proc/meminfo", "r"); + TEST_ASSERT(f != NULL, "Error in opening /proc/meminfo"); + + while (fgets(buf, sizeof(buf), f) != NULL) { + if (strstr(buf, hugepages_total) == buf) { + unsigned long long total = strtoull(buf + strlen(hugepages_total), NULL, 10); + if (!total) { + fprintf(stderr, "HUGETLB is not enabled in /proc/sys/vm/nr_hugepages\n"); + exit(KSFT_SKIP); + } + } + if (strstr(buf, hugepagesize) == buf) { + fclose(f); + return strtoull(buf + strlen(hugepagesize), NULL, 10) << 10; + } + } + + if (feof(f)) { + fprintf(stderr, "HUGETLB is not configured in host kernel"); + exit(KSFT_SKIP); + } + + TEST_FAIL("Error in reading /proc/meminfo"); +} + +#define ANON_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) +#define ANON_HUGE_FLAGS (ANON_FLAGS | MAP_HUGETLB) + +const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) +{ + static const struct vm_mem_backing_src_alias aliases[] = { + [VM_MEM_SRC_ANONYMOUS] = { + .name = "anonymous", + .flag = ANON_FLAGS, + }, + [VM_MEM_SRC_ANONYMOUS_THP] = { + .name = "anonymous_thp", + .flag = ANON_FLAGS, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { + .name = "anonymous_hugetlb", + .flag = ANON_HUGE_FLAGS, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { + .name = "anonymous_hugetlb_16kb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { + .name = "anonymous_hugetlb_64kb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_64KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { + .name = "anonymous_hugetlb_512kb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { + .name = "anonymous_hugetlb_1mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { + .name = "anonymous_hugetlb_2mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { + .name = "anonymous_hugetlb_8mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_8MB, + }, + 
[VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { + .name = "anonymous_hugetlb_16mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { + .name = "anonymous_hugetlb_32mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_32MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { + .name = "anonymous_hugetlb_256mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_256MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { + .name = "anonymous_hugetlb_512mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { + .name = "anonymous_hugetlb_1gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { + .name = "anonymous_hugetlb_2gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { + .name = "anonymous_hugetlb_16gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16GB, + }, + [VM_MEM_SRC_SHMEM] = { + .name = "shmem", + .flag = MAP_SHARED, + }, + [VM_MEM_SRC_SHARED_HUGETLB] = { + .name = "shared_hugetlb", + /* + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since + * we're using "file backed" memory, we need to specify + * this when the FD is created, not when the area is + * mapped. + */ + .flag = MAP_SHARED, + }, + }; + _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, + "Missing new backing src types?"); + + TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %d too big", i); + + return &aliases[i]; +} + +#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)) + +size_t get_backing_src_pagesz(uint32_t i) +{ + uint32_t flag = vm_mem_backing_src_alias(i)->flag; + + switch (i) { + case VM_MEM_SRC_ANONYMOUS: + case VM_MEM_SRC_SHMEM: + return getpagesize(); + case VM_MEM_SRC_ANONYMOUS_THP: + return get_trans_hugepagesz(); + case VM_MEM_SRC_ANONYMOUS_HUGETLB: + case VM_MEM_SRC_SHARED_HUGETLB: + return get_def_hugetlb_pagesz(); + default: + return MAP_HUGE_PAGE_SIZE(flag); + } +} + +bool is_backing_src_hugetlb(uint32_t i) +{ + return !!(vm_mem_backing_src_alias(i)->flag & MAP_HUGETLB); +} + +static void print_available_backing_src_types(const char *prefix) +{ + int i; + + printf("%sAvailable backing src types:\n", prefix); + + for (i = 0; i < NUM_SRC_TYPES; i++) + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); +} + +void backing_src_help(const char *flag) +{ + printf(" %s: specify the type of memory that should be used to\n" + " back the guest data region. 
(default: %s)\n", + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); + print_available_backing_src_types(" "); +} + +enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) +{ + int i; + + for (i = 0; i < NUM_SRC_TYPES; i++) + if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) + return i; + + print_available_backing_src_types(""); + TEST_FAIL("Unknown backing src type: %s", type_name); + return -1; +} + +long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + /* Return MIN_RUN_DELAY_NS upon failure just to be safe */ + if (fscanf(fp, "%ld %ld ", &val[0], &val[1]) < 2) + val[1] = MIN_RUN_DELAY_NS; + fclose(fp); + + return val[1]; +} + +int atoi_paranoid(const char *num_str) +{ + char *end_ptr; + long num; + + errno = 0; + num = strtol(num_str, &end_ptr, 0); + TEST_ASSERT(!errno, "strtol(\"%s\") failed", num_str); + TEST_ASSERT(num_str != end_ptr, + "strtol(\"%s\") didn't find a valid integer.", num_str); + TEST_ASSERT(*end_ptr == '\0', + "strtol(\"%s\") failed to parse trailing characters \"%s\".", + num_str, end_ptr); + TEST_ASSERT(num >= INT_MIN && num <= INT_MAX, + "%ld not in range of [%d, %d]", num, INT_MIN, INT_MAX); + + return num; +} + +char *strdup_printf(const char *fmt, ...) +{ + va_list ap; + char *str; + + va_start(ap, fmt); + TEST_ASSERT(vasprintf(&str, fmt, ap) >= 0, "vasprintf() failed"); + va_end(ap); + + return str; +} + +#define CLOCKSOURCE_PATH "/sys/devices/system/clocksource/clocksource0/current_clocksource" + +char *sys_get_cur_clocksource(void) +{ + char *clk_name; + struct stat st; + FILE *fp; + + fp = fopen(CLOCKSOURCE_PATH, "r"); + TEST_ASSERT(fp, "failed to open clocksource file, errno: %d", errno); + + TEST_ASSERT(!fstat(fileno(fp), &st), "failed to stat clocksource file, errno: %d", + errno); + + clk_name = malloc(st.st_size); + TEST_ASSERT(clk_name, "failed to allocate buffer to read file"); + + TEST_ASSERT(fgets(clk_name, st.st_size, fp), "failed to read clocksource file: %d", + ferror(fp)); + + fclose(fp); + + return clk_name; +} diff --git a/tools/testing/selftests/kvm/lib/ucall_common.c b/tools/testing/selftests/kvm/lib/ucall_common.c new file mode 100644 index 000000000000..42151e571953 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/ucall_common.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "linux/types.h" +#include "linux/bitmap.h" +#include "linux/atomic.h" + +#include "kvm_util.h" +#include "ucall_common.h" + + +#define GUEST_UCALL_FAILED -1 + +struct ucall_header { + DECLARE_BITMAP(in_use, KVM_MAX_VCPUS); + struct ucall ucalls[KVM_MAX_VCPUS]; +}; + +int ucall_nr_pages_required(uint64_t page_size) +{ + return align_up(sizeof(struct ucall_header), page_size) / page_size; +} + +/* + * ucall_pool holds per-VM values (global data is duplicated by each VM), it + * must not be accessed from host code. 
+ */ +static struct ucall_header *ucall_pool; + +void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa) +{ + struct ucall_header *hdr; + struct ucall *uc; + vm_vaddr_t vaddr; + int i; + + vaddr = vm_vaddr_alloc_shared(vm, sizeof(*hdr), KVM_UTIL_MIN_VADDR, + MEM_REGION_DATA); + hdr = (struct ucall_header *)addr_gva2hva(vm, vaddr); + memset(hdr, 0, sizeof(*hdr)); + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + uc = &hdr->ucalls[i]; + uc->hva = uc; + } + + write_guest_global(vm, ucall_pool, (struct ucall_header *)vaddr); + + ucall_arch_init(vm, mmio_gpa); +} + +static struct ucall *ucall_alloc(void) +{ + struct ucall *uc; + int i; + + if (!ucall_pool) + goto ucall_failed; + + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + if (!test_and_set_bit(i, ucall_pool->in_use)) { + uc = &ucall_pool->ucalls[i]; + memset(uc->args, 0, sizeof(uc->args)); + return uc; + } + } + +ucall_failed: + /* + * If the vCPU cannot grab a ucall structure, make a bare ucall with a + * magic value to signal to get_ucall() that things went sideways. + * GUEST_ASSERT() depends on ucall_alloc() and so cannot be used here. + */ + ucall_arch_do_ucall(GUEST_UCALL_FAILED); + return NULL; +} + +static void ucall_free(struct ucall *uc) +{ + /* Beware, here be pointer arithmetic. */ + clear_bit(uc - ucall_pool->ucalls, ucall_pool->in_use); +} + +void ucall_assert(uint64_t cmd, const char *exp, const char *file, + unsigned int line, const char *fmt, ...) +{ + struct ucall *uc; + va_list va; + + uc = ucall_alloc(); + uc->cmd = cmd; + + WRITE_ONCE(uc->args[GUEST_ERROR_STRING], (uint64_t)(exp)); + WRITE_ONCE(uc->args[GUEST_FILE], (uint64_t)(file)); + WRITE_ONCE(uc->args[GUEST_LINE], line); + + va_start(va, fmt); + guest_vsnprintf(uc->buffer, UCALL_BUFFER_LEN, fmt, va); + va_end(va); + + ucall_arch_do_ucall((vm_vaddr_t)uc->hva); + + ucall_free(uc); +} + +void ucall_fmt(uint64_t cmd, const char *fmt, ...) +{ + struct ucall *uc; + va_list va; + + uc = ucall_alloc(); + uc->cmd = cmd; + + va_start(va, fmt); + guest_vsnprintf(uc->buffer, UCALL_BUFFER_LEN, fmt, va); + va_end(va); + + ucall_arch_do_ucall((vm_vaddr_t)uc->hva); + + ucall_free(uc); +} + +void ucall(uint64_t cmd, int nargs, ...) +{ + struct ucall *uc; + va_list va; + int i; + + uc = ucall_alloc(); + + WRITE_ONCE(uc->cmd, cmd); + + nargs = min(nargs, UCALL_MAX_ARGS); + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + WRITE_ONCE(uc->args[i], va_arg(va, uint64_t)); + va_end(va); + + ucall_arch_do_ucall((vm_vaddr_t)uc->hva); + + ucall_free(uc); +} + +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + struct ucall ucall; + void *addr; + + if (!uc) + uc = &ucall; + + addr = ucall_arch_get_ucall(vcpu); + if (addr) { + TEST_ASSERT(addr != (void *)GUEST_UCALL_FAILED, + "Guest failed to allocate ucall struct"); + + memcpy(uc, addr, sizeof(*uc)); + vcpu_run_complete_io(vcpu); + } else { + memset(uc, 0, sizeof(*uc)); + } + + return uc->cmd; +} diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c new file mode 100644 index 000000000000..7c9de8414462 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM userfaultfd util + * Adapted from demand_paging_test.c + * + * Copyright (C) 2018, Red Hat, Inc. 
+ * Copyright (C) 2019-2022 Google LLC + */ +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <poll.h> +#include <pthread.h> +#include <linux/userfaultfd.h> +#include <sys/epoll.h> +#include <sys/syscall.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "memstress.h" +#include "userfaultfd_util.h" + +#ifdef __NR_userfaultfd + +static void *uffd_handler_thread_fn(void *arg) +{ + struct uffd_reader_args *reader_args = (struct uffd_reader_args *)arg; + int uffd = reader_args->uffd; + int64_t pages = 0; + struct timespec start; + struct timespec ts_diff; + struct epoll_event evt; + int epollfd; + + epollfd = epoll_create(1); + TEST_ASSERT(epollfd >= 0, "Failed to create epollfd."); + + evt.events = EPOLLIN | EPOLLEXCLUSIVE; + evt.data.u32 = 0; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, uffd, &evt), + "Failed to add uffd to epollfd"); + + evt.events = EPOLLIN; + evt.data.u32 = 1; + TEST_ASSERT(!epoll_ctl(epollfd, EPOLL_CTL_ADD, reader_args->pipe, &evt), + "Failed to add pipe to epollfd"); + + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { + struct uffd_msg msg; + int r; + + r = epoll_wait(epollfd, &evt, 1, -1); + TEST_ASSERT(r == 1, + "Unexpected number of events (%d) from epoll, errno = %d", + r, errno); + + if (evt.data.u32 == 1) { + char tmp_chr; + + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on pipe."); + r = read(reader_args->pipe, &tmp_chr, 1); + TEST_ASSERT(r == 1, + "Error reading pipefd in uffd reader thread"); + break; + } + + TEST_ASSERT(!(evt.events & (EPOLLERR | EPOLLHUP)), + "Reader thread received EPOLLERR or EPOLLHUP on uffd."); + + r = read(uffd, &msg, sizeof(msg)); + if (r == -1) { + TEST_ASSERT(errno == EAGAIN, + "Error reading from UFFD: errno = %d", errno); + continue; + } + + TEST_ASSERT(r == sizeof(msg), + "Read on uffd returned unexpected number of bytes (%d)", r); + + if (!(msg.event & UFFD_EVENT_PAGEFAULT)) + continue; + + if (reader_args->delay) + usleep(reader_args->delay); + r = reader_args->handler(reader_args->uffd_mode, uffd, &msg); + TEST_ASSERT(r >= 0, + "Reader thread handler fn returned negative value %d", r); + pages++; + } + + ts_diff = timespec_elapsed(start); + PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", + pages, ts_diff.tv_sec, ts_diff.tv_nsec, + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); + + return NULL; +} + +struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, + void *hva, uint64_t len, + uint64_t num_readers, + uffd_handler_t handler) +{ + struct uffd_desc *uffd_desc; + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); + int uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; + int ret, i; + + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? 
"UFFDIO_CONINUE" : "UFFDIO_COPY"); + + uffd_desc = malloc(sizeof(struct uffd_desc)); + TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); + + uffd_desc->pipefds = calloc(sizeof(int), num_readers); + TEST_ASSERT(uffd_desc->pipefds, "Failed to alloc pipes"); + + uffd_desc->readers = calloc(sizeof(pthread_t), num_readers); + TEST_ASSERT(uffd_desc->readers, "Failed to alloc reader threads"); + + uffd_desc->reader_args = calloc(sizeof(struct uffd_reader_args), num_readers); + TEST_ASSERT(uffd_desc->reader_args, "Failed to alloc reader_args"); + + uffd_desc->num_readers = num_readers; + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); + + uffdio_register.range.start = (uint64_t)hva; + uffdio_register.range.len = len; + uffdio_register.mode = uffd_mode; + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + + uffd_desc->uffd = uffd; + for (i = 0; i < uffd_desc->num_readers; ++i) { + int pipes[2]; + + ret = pipe2((int *) &pipes, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(!ret, "Failed to set up pipefd %i for uffd_desc %p", + i, uffd_desc); + + uffd_desc->pipefds[i] = pipes[1]; + + uffd_desc->reader_args[i].uffd_mode = uffd_mode; + uffd_desc->reader_args[i].uffd = uffd; + uffd_desc->reader_args[i].delay = delay; + uffd_desc->reader_args[i].handler = handler; + uffd_desc->reader_args[i].pipe = pipes[0]; + + pthread_create(&uffd_desc->readers[i], NULL, uffd_handler_thread_fn, + &uffd_desc->reader_args[i]); + + PER_VCPU_DEBUG("Created uffd thread %i for HVA range [%p, %p)\n", + i, hva, hva + len); + } + + return uffd_desc; +} + +void uffd_stop_demand_paging(struct uffd_desc *uffd) +{ + char c = 0; + int i; + + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(write(uffd->pipefds[i], &c, 1) == 1, + "Unable to write to pipefd %i for uffd_desc %p", i, uffd); + + for (i = 0; i < uffd->num_readers; ++i) + TEST_ASSERT(!pthread_join(uffd->readers[i], NULL), + "Pthread_join failed on reader %i for uffd_desc %p", i, uffd); + + close(uffd->uffd); + + for (i = 0; i < uffd->num_readers; ++i) { + close(uffd->pipefds[i]); + close(uffd->reader_args[i].pipe); + } + + free(uffd->pipefds); + free(uffd->readers); + free(uffd->reader_args); + free(uffd); +} + +#endif /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c new file mode 100644 index 000000000000..89153a333e83 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/apic.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Google LLC. 
+ */ + +#include "apic.h" + +void apic_disable(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void xapic_enable(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + apic_disable(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. + */ + val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + xapic_write_reg(APIC_SPIV, val); +} + +void x2apic_enable(void) +{ + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + x2apic_write_reg(APIC_SPIV, + x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S new file mode 100644 index 000000000000..7629819734af --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ +666 : +.pushsection .rodata + .quad 666b +.popsection + + .if ! \has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c new file mode 100644 index 000000000000..efb7e7a1354d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Hyper-V specific functions. + * + * Copyright (C) 2021, Red Hat Inc. + */ +#include <stdint.h> +#include "processor.h" +#include "hyperv.h" + +struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm, + vm_vaddr_t *p_hv_pages_gva) +{ + vm_vaddr_t hv_pages_gva = vm_vaddr_alloc_page(vm); + struct hyperv_test_pages *hv = addr_gva2hva(vm, hv_pages_gva); + + /* Setup of a region of guest memory for the VP Assist page. 
*/ + hv->vp_assist = (void *)vm_vaddr_alloc_page(vm); + hv->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->vp_assist); + hv->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->vp_assist); + + /* Setup of a region of guest memory for the partition assist page. */ + hv->partition_assist = (void *)vm_vaddr_alloc_page(vm); + hv->partition_assist_hva = addr_gva2hva(vm, (uintptr_t)hv->partition_assist); + hv->partition_assist_gpa = addr_gva2gpa(vm, (uintptr_t)hv->partition_assist); + + /* Setup of a region of guest memory for the enlightened VMCS. */ + hv->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); + hv->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)hv->enlightened_vmcs); + hv->enlightened_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)hv->enlightened_vmcs); + + *p_hv_pages_gva = hv_pages_gva; + return hv; +} + +int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist) +{ + uint64_t val = (vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK) | + HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + + wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, val); + + current_vp_assist = vp_assist; + + return 0; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/memstress.c b/tools/testing/selftests/kvm/lib/x86_64/memstress.c new file mode 100644 index 000000000000..d61e623afc8c --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/memstress.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86_64-specific extensions to memstress.c. + * + * Copyright (C) 2022, Google, Inc. + */ +#include <stdio.h> +#include <stdlib.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "memstress.h" +#include "processor.h" +#include "vmx.h" + +void memstress_l2_guest_code(uint64_t vcpu_id) +{ + memstress_guest_code(vcpu_id); + vmcall(); +} + +extern char memstress_l2_guest_entry[]; +__asm__( +"memstress_l2_guest_entry:" +" mov (%rsp), %rdi;" +" call memstress_l2_guest_code;" +" ud2;" +); + +static void memstress_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + GUEST_ASSERT(ept_1g_pages_supported()); + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + prepare_vmcs(vmx, memstress_l2_guest_entry, rsp); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +uint64_t memstress_nested_pages(int nr_vcpus) +{ + /* + * 513 page tables is enough to identity-map 256 TiB of L2 with 1G + * pages and 4-level paging, plus a few pages per-vCPU for data + * structures such as the VMCS. + */ + return 513 + 10 * nr_vcpus; +} + +void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + uint64_t start, end; + + prepare_eptp(vmx, vm, 0); + + /* + * Identity map the first 4G and the test region with 1G pages so that + * KVM can shadow the EPT12 with the maximum huge page size supported + * by the backing source. 
+ */ + nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + + start = align_down(memstress_args.gpa, PG_SIZE_1G); + end = align_up(memstress_args.gpa + memstress_args.size, PG_SIZE_1G); + nested_identity_map_1g(vmx, vm, start, end - start); +} + +void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) +{ + struct vmx_pages *vmx, *vmx0 = NULL; + struct kvm_regs regs; + vm_vaddr_t vmx_gva; + int vcpu_id; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has_ept()); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vmx = vcpu_alloc_vmx(vm, &vmx_gva); + + if (vcpu_id == 0) { + memstress_setup_ept(vmx, vm); + vmx0 = vmx; + } else { + /* Share the same EPT table across all vCPUs. */ + vmx->eptp = vmx0->eptp; + vmx->eptp_hva = vmx0->eptp_hva; + vmx->eptp_gpa = vmx0->eptp_gpa; + } + + /* + * Override the vCPU to run memstress_l1_guest_code() which will + * bounce it into L2 before calling memstress_guest_code(). + */ + vcpu_regs_get(vcpus[vcpu_id], ®s); + regs.rip = (unsigned long) memstress_l1_guest_code; + vcpu_regs_set(vcpus[vcpu_id], ®s); + vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + } +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/pmu.c b/tools/testing/selftests/kvm/lib/x86_64/pmu.c new file mode 100644 index 000000000000..f31f0427c17c --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/pmu.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Tencent, Inc. + */ + +#include <stdint.h> + +#include <linux/kernel.h> + +#include "kvm_util.h" +#include "pmu.h" + +const uint64_t intel_pmu_arch_events[] = { + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + INTEL_ARCH_TOPDOWN_SLOTS, +}; +kvm_static_assert(ARRAY_SIZE(intel_pmu_arch_events) == NR_INTEL_ARCH_EVENTS); + +const uint64_t amd_pmu_zen_events[] = { + AMD_ZEN_CORE_CYCLES, + AMD_ZEN_INSTRUCTIONS_RETIRED, + AMD_ZEN_BRANCHES_RETIRED, + AMD_ZEN_BRANCHES_MISPREDICTED, +}; +kvm_static_assert(ARRAY_SIZE(amd_pmu_zen_events) == NR_AMD_ZEN_EVENTS); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index f6eb34eaa0d2..c664e446136b 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -5,80 +5,28 @@ * Copyright (C) 2018, Google LLC. */ -#define _GNU_SOURCE /* for program_invocation_name */ - +#include "linux/bitmap.h" #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" +#include "sev.h" + +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif + +#define KERNEL_CS 0x8 +#define KERNEL_DS 0x10 +#define KERNEL_TSS 0x18 -/* Minimum physical address used for virtual translation tables. 
*/ -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - -/* Virtual translation table structure declarations */ -struct pageMapL4Entry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryPointerEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageDirectoryEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -struct pageTableEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t reserved_07:1; - uint64_t global:1; - uint64_t ignored_11_09:3; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; - -void regs_dump(FILE *stream, struct kvm_regs *regs, - uint8_t indent) +#define MAX_NR_CPUID_ENTRIES 100 + +vm_vaddr_t exception_handlers; +bool host_cpu_is_amd; +bool host_cpu_is_intel; +bool is_forced_emulation_enabled; + +static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " "rcx: 0x%.16llx rdx: 0x%.16llx\n", @@ -101,21 +49,6 @@ void regs_dump(FILE *stream, struct kvm_regs *regs, regs->rip, regs->rflags); } -/* - * Segment Dump - * - * Input Args: - * stream - Output FILE stream - * segment - KVM segment - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the KVM segment given by @segment, to the FILE stream - * given by @stream. - */ static void segment_dump(FILE *stream, struct kvm_segment *segment, uint8_t indent) { @@ -133,21 +66,6 @@ static void segment_dump(FILE *stream, struct kvm_segment *segment, segment->unusable, segment->padding); } -/* - * dtable Dump - * - * Input Args: - * stream - Output FILE stream - * dtable - KVM dtable - * indent - Left margin indent amount - * - * Output Args: None - * - * Return: None - * - * Dumps the state of the KVM dtable given by @dtable, to the FILE stream - * given by @stream. 
- */ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, uint8_t indent) { @@ -157,8 +75,7 @@ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, dtable->padding[0], dtable->padding[1], dtable->padding[2]); } -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, - uint8_t indent) +static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) { unsigned int i; @@ -200,97 +117,217 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, } } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +bool kvm_is_tdp_enabled(void) +{ + if (host_cpu_is_intel) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. */ if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - vm->pgd = paddr; + vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; } } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +static void *virt_get_pte(struct kvm_vm *vm, uint64_t *parent_pte, + uint64_t vaddr, int level) +{ + uint64_t pt_gpa = PTE_GET_PA(*parent_pte); + uint64_t *page_table = addr_gpa2hva(vm, pt_gpa); + int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + + TEST_ASSERT((*parent_pte & PTE_PRESENT_MASK) || parent_pte == &vm->pgd, + "Parent PTE (level %d) not PRESENT for gva: 0x%08lx", + level + 1, vaddr); + + return &page_table[index]; +} + +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t *parent_pte, + uint64_t vaddr, + uint64_t paddr, + int current_level, + int target_level) +{ + uint64_t *pte = virt_get_pte(vm, parent_pte, vaddr, current_level); + + paddr = vm_untag_gpa(vm, paddr); + + if (!(*pte & PTE_PRESENT_MASK)) { + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (current_level == target_level) + *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + else + *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. 
+ */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, vaddr: 0x%lx", + current_level, vaddr); + TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + "Cannot create page table at level: %u, vaddr: 0x%lx", + current_level, vaddr); + } + return pte; +} + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { - uint16_t index[4]; - struct pageMapL4Entry *pml4e; + const uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % pg_size) == 0, + "Virtual address not aligned,\n" + "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % pg_size) == 0, + "Physical address not aligned,\n" + " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + TEST_ASSERT(vm_untag_gpa(vm, paddr) == paddr, + "Unexpected bits in paddr: %lx", paddr); + + /* + * Allocate upper level page tables, if not already present. Return + * early if a hugepage was created. + */ + pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); + if (*pml4e & PTE_LARGE_MASK) + return; + + pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); + if (*pdpe & PTE_LARGE_MASK) + return; + + pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); + if (*pde & PTE_LARGE_MASK) + return; + + /* Fill in page table entry. */ + pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + "PTE already present for 4k page at vaddr: 0x%lx", vaddr); + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); + + /* + * Neither SEV nor TDX supports shared page tables, so only the final + * leaf PTE needs manually set the C/S-bit. 
+ */ + if (vm_is_gpa_protected(vm, paddr)) + *pte |= vm->arch.c_bit; + else + *pte |= vm->arch.s_bit; +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); +} + +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + +static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) +{ + if (*pte & PTE_LARGE_MASK) { + TEST_ASSERT(*level == PG_LEVEL_NONE || + *level == current_level, + "Unexpected hugepage at level %d", current_level); + *level = current_level; + } + + return *level == current_level; +} + +uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, + int *level) +{ + uint64_t *pml4e, *pdpe, *pde; + + TEST_ASSERT(!vm->arch.is_pt_protected, + "Walking page tables of protected guests is impossible"); + + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + "Invalid PG_LEVEL_* '%d'", *level); TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((vaddr % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", - vaddr, vm->page_size); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); - - index[0] = (vaddr >> 12) & 0x1ffu; - index[1] = (vaddr >> 21) & 0x1ffu; - index[2] = (vaddr >> 30) & 0x1ffu; - index[3] = (vaddr >> 39) & 0x1ffu; - - /* Allocate page directory pointer table if not present. */ - pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].present = true; - } + /* + * Based on the mode check above there are 48 bits in the vaddr, so + * shift 16 to sign extend the last bit (bit-47), + */ + TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), + "Canonical check failed. The virtual address is invalid."); - /* Allocate page directory table if not present. */ - struct pageDirectoryPointerEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].present = true; - } + pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); + if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) + return pml4e; - /* Allocate page table if not present. 
*/ - struct pageDirectoryEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].present = true; - } + pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); + if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) + return pdpe; - /* Fill in page table entry. */ - struct pageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].present = 1; + pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); + if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) + return pde; + + return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) { - struct pageMapL4Entry *pml4e, *pml4e_start; - struct pageDirectoryPointerEntry *pdpe, *pdpe_start; - struct pageDirectoryEntry *pde, *pde_start; - struct pageTableEntry *pte, *pte_start; + int level = PG_LEVEL_4K; + + return __vm_get_page_table_entry(vm, vaddr, &level); +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + uint64_t *pml4e, *pml4e_start; + uint64_t *pdpe, *pdpe_start; + uint64_t *pde, *pde_start; + uint64_t *pte, *pte_start; if (!vm->pgd_created) return; @@ -300,62 +337,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, - vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!pml4e->present) + if (!(*pml4e & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, - pml4e->writable, pml4e->execute_disable); + addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), + !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); - pdpe_start = addr_gpa2hva(vm, pml4e->address - * vm->page_size); + pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!pdpe->present) + if (!(*pdpe & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - (uint64_t) pdpe->address, pdpe->writable, - pdpe->execute_disable); + PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), + !!(*pdpe & PTE_NX_MASK)); - pde_start = addr_gpa2hva(vm, - pdpe->address * vm->page_size); + pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!pde->present) + if (!(*pde & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spde 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u\n", + "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - (uint64_t) pde->address, pde->writable, - pde->execute_disable); + PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), + !!(*pde & PTE_NX_MASK)); - pte_start = addr_gpa2hva(vm, - pde->address * vm->page_size); + pte_start = 
addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!pte->present) + if (!(*pte & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spte 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u " + "0x%-12lx 0x%-10llx %u %u " " %u 0x%-10lx\n", indent, "", pte - pte_start, pte, addr_hva2gpa(vm, pte), - (uint64_t) pte->address, - pte->writable, - pte->execute_disable, - pte->dirty, + PTE_GET_PFN(*pte), + !!(*pte & PTE_WRITABLE_MASK), + !!(*pte & PTE_NX_MASK), + !!(*pte & PTE_DIRTY_MASK), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -386,17 +419,18 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp) static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) { - void *gdt = addr_gva2hva(vm, vm->gdt); + void *gdt = addr_gva2hva(vm, vm->arch.gdt); struct desc64 *desc = gdt + (segp->selector >> 3) * 8; desc->limit0 = segp->limit & 0xFFFF; desc->base0 = segp->base & 0xFFFF; desc->base1 = segp->base >> 16; - desc->s = segp->s; desc->type = segp->type; + desc->s = segp->s; desc->dpl = segp->dpl; desc->p = segp->present; desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; desc->l = segp->l; desc->db = segp->db; desc->g = segp->g; @@ -405,27 +439,10 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->base3 = segp->base >> 32; } - -/* - * Set Long Mode Flat Kernel Code Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a code segment - * with the selector value given by @selector. - */ -static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_code_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_CS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x08 | 0x01 | 0x02; /* kFlagCode | kFlagCodeAccessed @@ -434,30 +451,12 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, segp->g = true; segp->l = true; segp->present = 1; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } -/* - * Set Long Mode Flat Kernel Data Segment - * - * Input Args: - * vm - VM whose GDT is being filled, or NULL to only write segp - * selector - selector value - * - * Output Args: - * segp - Pointer to KVM segment - * - * Return: None - * - * Sets up the KVM segment pointed to by @segp, to be a data segment - * with the selector value given by @selector. 
- */ -static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, - struct kvm_segment *segp) +static void kvm_seg_set_kernel_data_64bit(struct kvm_segment *segp) { memset(segp, 0, sizeof(*segp)); - segp->selector = selector; + segp->selector = KERNEL_DS; segp->limit = 0xFFFFFFFFu; segp->s = 0x1; /* kTypeCodeData */ segp->type = 0x00 | 0x01 | 0x02; /* kFlagData | kFlagDataAccessed @@ -465,395 +464,456 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, */ segp->g = true; segp->present = true; - if (vm) - kvm_seg_fill_gdt_64bit(vm, segp); } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { - uint16_t index[4]; - struct pageMapL4Entry *pml4e; - struct pageDirectoryPointerEntry *pdpe; - struct pageDirectoryEntry *pde; - struct pageTableEntry *pte; + int level = PG_LEVEL_NONE; + uint64_t *pte = __vm_get_page_table_entry(vm, gva, &level); - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(*pte & PTE_PRESENT_MASK, + "Leaf PTE not PRESENT for gva: 0x%08lx", gva); - index[0] = (gva >> 12) & 0x1ffu; - index[1] = (gva >> 21) & 0x1ffu; - index[2] = (gva >> 30) & 0x1ffu; - index[3] = (gva >> 39) & 0x1ffu; + /* + * No need for a hugepage mask on the PTE, x86-64 requires the "unused" + * address bits to be zero. + */ + return vm_untag_gpa(vm, PTE_GET_PA(*pte)) | (gva & ~HUGEPAGE_MASK(level)); +} - if (!vm->pgd_created) - goto unmapped_gva; - pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) - goto unmapped_gva; +static void kvm_seg_set_tss_64bit(vm_vaddr_t base, struct kvm_segment *segp) +{ + memset(segp, 0, sizeof(*segp)); + segp->base = base; + segp->limit = 0x67; + segp->selector = KERNEL_TSS; + segp->type = 0xb; + segp->present = 1; +} + +static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct kvm_sregs sregs; + + TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + + /* Set mode specific system register values. 
*/ + vcpu_sregs_get(vcpu, &sregs); - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) - goto unmapped_gva; + sregs.idt.base = vm->arch.idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->arch.gdt; + sregs.gdt.limit = getpagesize() - 1; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) - goto unmapped_gva; + sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; + sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; + sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - if (!pte[index[0]].present) - goto unmapped_gva; + kvm_seg_set_unusable(&sregs.ldt); + kvm_seg_set_kernel_code_64bit(&sregs.cs); + kvm_seg_set_kernel_data_64bit(&sregs.ds); + kvm_seg_set_kernel_data_64bit(&sregs.es); + kvm_seg_set_kernel_data_64bit(&sregs.gs); + kvm_seg_set_tss_64bit(vm->arch.tss, &sregs.tr); - return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + sregs.cr3 = vm->pgd; + vcpu_sregs_set(vcpu, &sregs); +} -unmapped_gva: - TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); - exit(EXIT_FAILURE); +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->arch.idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, - int pgd_memslot) +static bool kvm_fixup_exception(struct ex_regs *regs) { - if (!vm->gdt) - vm->gdt = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; - dt->base = vm->gdt; - dt->limit = getpagesize(); + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + regs->r10 = regs->error_code; + return true; } -static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector, int gdt_memslot, - int pgd_memslot) +void route_exception(struct ex_regs *regs) { - if (!vm->tss) - vm->tss = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; - memset(segp, 0, sizeof(*segp)); - segp->base = vm->tss; - segp->limit = 0x67; - segp->selector = selector; - segp->type = 0xb; - segp->present = 1; - kvm_seg_fill_gdt_64bit(vm, segp); + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + if (kvm_fixup_exception(regs)) + return; + + ucall_assert(UCALL_UNHANDLED, + "Unhandled exception in guest", __FILE__, __LINE__, + "Unhandled exception '0x%lx' at guest RIP '0x%lx'", + regs->vector, regs->rip); } -static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +static void vm_init_descriptor_tables(struct kvm_vm *vm) { - struct kvm_sregs sregs; + extern void *idt_handlers; + struct kvm_segment seg; + int i; - /* Set mode specific system register values. 
*/ - vcpu_sregs_get(vm, vcpuid, &sregs); + vm->arch.gdt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.idt = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->handlers = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + vm->arch.tss = __vm_vaddr_alloc_page(vm, MEM_REGION_DATA); + + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, KERNEL_CS); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; - sregs.idt.limit = 0; + kvm_seg_set_kernel_code_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); - kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + kvm_seg_set_kernel_data_64bit(&seg); + kvm_seg_fill_gdt_64bit(vm, &seg); - switch (vm->mode) { - case VM_MODE_PXXV48_4K: - sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; - sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; - sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); + kvm_seg_set_tss_64bit(vm->arch.tss, &seg); + kvm_seg_fill_gdt_64bit(vm, &seg); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); - kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); - break; + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; - default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) + REPORT_GUEST_ASSERT(uc); +} + +void kvm_arch_vm_post_create(struct kvm_vm *vm) +{ + vm_create_irqchip(vm); + vm_init_descriptor_tables(vm); + + sync_global_to_guest(vm, host_cpu_is_intel); + sync_global_to_guest(vm, host_cpu_is_amd); + sync_global_to_guest(vm, is_forced_emulation_enabled); + + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + struct kvm_sev_init init = { 0 }; + + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); } +} - sregs.cr3 = vm->pgd; - vcpu_sregs_set(vm, vcpuid, &sregs); +void vcpu_arch_set_entry_point(struct kvm_vcpu *vcpu, void *guest_code) +{ + struct kvm_regs regs; + + vcpu_regs_get(vcpu, ®s); + regs.rip = (unsigned long) guest_code; + vcpu_regs_set(vcpu, ®s); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { struct kvm_mp_state mp_state; struct kvm_regs regs; vm_vaddr_t stack_vaddr; - stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + struct kvm_vcpu *vcpu; + + stack_vaddr = __vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), + DEFAULT_GUEST_STACK_VADDR_MIN, + MEM_REGION_DATA); + + stack_vaddr += DEFAULT_STACK_PGS * getpagesize(); + + /* + * Align stack to match calling sequence requirements in section "The + * Stack Frame" of the System V ABI AMD64 Architecture Processor + * Supplement, which requires the value (%rsp + 8) to be a multiple of + * 16 when control is transferred to the function entry point. + * + * If this code is ever used to launch a vCPU with 32-bit entry point it + * may need to subtract 4 bytes instead of 8 bytes. 
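
/*
 * A worked example of the stack adjustment above: the allocated stack top is
 * page-aligned, and subtracting 8 leaves (%rsp + 8) a multiple of 16 at the
 * guest entry point, which is what the System V x86-64 ABI requires. The
 * starting address below is made up purely for illustration.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t stack_top = 0x7fff0000ULL;	/* hypothetical page-aligned stack top */

	assert(stack_top % 4096 == 0);		/* page-aligned, hence 16-byte aligned */
	stack_top -= 8;				/* same adjustment as above */
	assert((stack_top + 8) % 16 == 0);	/* ABI rule at function entry */
	assert(stack_top % 16 == 8);		/* equivalent restatement */
	return 0;
}
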
+ */ + TEST_ASSERT(IS_ALIGNED(stack_vaddr, PAGE_SIZE), + "__vm_vaddr_alloc() did not provide a page-aligned address"); + stack_vaddr -= 8; - /* Create VCPU */ - vm_vcpu_add(vm, vcpuid); - vcpu_setup(vm, vcpuid, 0, 0); + vcpu = __vm_vcpu_add(vm, vcpu_id); + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_init_sregs(vm, vcpu); /* Setup guest general purpose registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs.rflags = regs.rflags | 0x2; - regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); - regs.rip = (unsigned long) guest_code; - vcpu_regs_set(vm, vcpuid, ®s); + regs.rsp = stack_vaddr; + vcpu_regs_set(vcpu, ®s); /* Setup the MP state */ mp_state.mp_state = 0; - vcpu_set_mp_state(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vcpu, &mp_state); + + return vcpu; } -/* - * Allocate an instance of struct kvm_cpuid2 - * - * Input Args: None - * - * Output Args: None - * - * Return: A pointer to the allocated struct. The caller is responsible - * for freeing this struct. - * - * Since kvm_cpuid2 uses a 0-length array to allow a the size of the - * array to be decided at allocation time, allocation is slightly - * complicated. This function uses a reasonable default length for - * the array and performs the appropriate allocation. - */ -static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) -{ - struct kvm_cpuid2 *cpuid; - int nent = 100; - size_t size; - - size = sizeof(*cpuid); - size += nent * sizeof(struct kvm_cpuid_entry2); - cpuid = malloc(size); - if (!cpuid) { - perror("malloc"); - abort(); - } +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - cpuid->nent = nent; + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - return cpuid; + return vcpu; } -/* - * KVM Supported CPUID Get - * - * Input Args: None - * - * Output Args: - * - * Return: The supported KVM CPUID - * - * Get the guest CPUID supported by KVM. - */ -struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +void vcpu_arch_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->cpuid) + free(vcpu->cpuid); +} + +/* Do not use kvm_supported_cpuid directly except for validity checks. */ +static void *kvm_supported_cpuid; + +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) { - static struct kvm_cpuid2 *cpuid; - int ret; int kvm_fd; - if (cpuid) - return cpuid; + if (kvm_supported_cpuid) + return kvm_supported_cpuid; - cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_supported_cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); - ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); - TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", - ret, errno); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, + (struct kvm_cpuid2 *)kvm_supported_cpuid); close(kvm_fd); - return cpuid; + return kvm_supported_cpuid; } -/* - * Locate a cpuid entry. - * - * Input Args: - * function: The function of the cpuid entry to find. - * index: The index of the cpuid entry. - * - * Output Args: None - * - * Return: A pointer to the cpuid entry. Never returns NULL. 
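
/*
 * The removed allocate_kvm_cpuid2() above uses the usual sizing idiom for a
 * structure that ends in a zero-length/flexible array: allocate the header
 * plus nent trailing entries in one block. A standalone sketch with a made-up
 * structure; struct cpuid2_like is illustrative, not the real struct
 * kvm_cpuid2.
 */
#include <stdint.h>
#include <stdlib.h>

struct cpuid2_like {
	uint32_t nent;
	uint32_t pad;
	struct { uint32_t eax, ebx, ecx, edx; } entries[];
};

static struct cpuid2_like *alloc_cpuid2_like(uint32_t nent)
{
	/* Header plus exactly nent trailing entries. */
	struct cpuid2_like *c = calloc(1, sizeof(*c) + nent * sizeof(c->entries[0]));

	if (c)
		c->nent = nent;
	return c;
}

int main(void)
{
	struct cpuid2_like *c = alloc_cpuid2_like(100);
	int ret = c ? 0 : 1;

	free(c);
	return ret;
}
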
- */ -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) +static uint32_t __kvm_cpu_has(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index, + uint8_t reg, uint8_t lo, uint8_t hi) { - struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry = NULL; + const struct kvm_cpuid_entry2 *entry; int i; - cpuid = kvm_get_supported_cpuid(); for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == function && - cpuid->entries[i].index == index) { - entry = &cpuid->entries[i]; - break; - } + entry = &cpuid->entries[i]; + + /* + * The output registers in kvm_cpuid_entry2 are in alphabetical + * order, but kvm_x86_cpu_feature matches that mess, so yay + * pointer shenanigans! + */ + if (entry->function == function && entry->index == index) + return ((&entry->eax)[reg] & GENMASK(hi, lo)) >> lo; } - TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", - function, index); - return entry; + return 0; } -/* - * VM VCPU CPUID Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU id - * cpuid - The CPUID values to set. - * - * Output Args: None - * - * Return: void - * - * Set the VCPU's CPUID. - */ -void vcpu_set_cpuid(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_cpuid2 *cpuid) +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int rc; + return __kvm_cpu_has(cpuid, feature.function, feature.index, + feature.reg, feature.bit, feature.bit); +} + +uint32_t kvm_cpuid_property(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_property property) +{ + return __kvm_cpu_has(cpuid, property.function, property.index, + property.reg, property.lo_bit, property.hi_bit); +} + +uint64_t kvm_get_feature_msr(uint64_t msr_index) +{ + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r, kvm_fd; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + kvm_fd = open_kvm_dev_path_or_exit(); - rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); - TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", - rc, errno); + r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); + close(kvm_fd); + return buffer.entry.data; } -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) +void __vm_xsave_require_permission(uint64_t xfeature, const char *name) { - struct kvm_vm *vm; - /* - * For x86 the maximum page table size for a memory region - * will be when only 4K pages are used. In that case the - * total extra size for page tables (for extra N pages) will - * be: N/512+N/512^2+N/512^3+... which is definitely smaller - * than N/512*2. 
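
/*
 * Two things make __kvm_cpu_has() above compact: the four CPUID output
 * registers are adjacent 32-bit fields, so (&entry->eax)[reg] indexes them
 * like an array, and GENMASK(hi, lo) plus a shift isolates one bit range.
 * A standalone sketch of both steps; struct cpuid_out and extract_bits() are
 * illustrative, and the register indexing relies on the fields being laid
 * out consecutively, just as the selftest code does.
 */
#include <assert.h>
#include <stdint.h>

struct cpuid_out {
	uint32_t eax, ebx, ecx, edx;
};

/* Extract bits hi..lo (inclusive), mirroring (val & GENMASK(hi, lo)) >> lo. */
static uint32_t extract_bits(uint32_t val, unsigned int hi, unsigned int lo)
{
	return (val >> lo) & (uint32_t)((1ULL << (hi - lo + 1)) - 1);
}

int main(void)
{
	struct cpuid_out out = { .eax = 0x1, .ebx = 0x2, .ecx = 0xabcd, .edx = 0x4 };
	unsigned int reg = 2;				/* 0 = eax ... 3 = edx */

	assert((&out.eax)[reg] == out.ecx);		/* index the registers like an array */
	assert(extract_bits(out.ecx, 15, 8) == 0xab);	/* bits 15:8 of ECX */
	return 0;
}
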
- */ - uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + int kvm_fd; + u64 bitmask; + long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask, + }; - /* Create VM */ - vm = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, - O_RDWR); + TEST_ASSERT(!kvm_supported_cpuid, + "kvm_get_supported_cpuid() cannot be used before ARCH_REQ_XCOMP_GUEST_PERM"); - /* Setup guest code */ - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + TEST_ASSERT(is_power_of_2(xfeature), + "Dynamic XFeatures must be enabled one at a time"); - /* Setup IRQ Chip */ - vm_create_irqchip(vm); + kvm_fd = open_kvm_dev_path_or_exit(); + rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); - /* Add the first vCPU. */ - vm_vcpu_add_default(vm, vcpuid, guest_code); + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); - return vm; + __TEST_REQUIRE(bitmask & xfeature, + "Required XSAVE feature '%s' not supported", name); + + TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, ilog2(xfeature))); + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & xfeature, + "'%s' (0x%lx) not permitted after prctl(ARCH_REQ_XCOMP_GUEST_PERM) permitted=0x%lx", + name, xfeature, bitmask); } -/* - * VCPU Get MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * - * Output Args: None - * - * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. - * - * Get value of MSR for VCPU. - */ -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) +{ + TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); + + /* Allow overriding the default CPUID. */ + if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) { + free(vcpu->cpuid); + vcpu->cpuid = NULL; + } + + if (!vcpu->cpuid) + vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent); + + memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent)); + vcpu_set_cpuid(vcpu); +} + +void vcpu_set_cpuid_property(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_property property, + uint32_t value) +{ + struct kvm_cpuid_entry2 *entry; + + entry = __vcpu_get_cpuid_entry(vcpu, property.function, property.index); + + (&entry->eax)[property.reg] &= ~GENMASK(property.hi_bit, property.lo_bit); + (&entry->eax)[property.reg] |= value << property.lo_bit; + + vcpu_set_cpuid(vcpu); + + /* Sanity check that @value doesn't exceed the bounds in any way. 
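
/*
 * vcpu_set_cpuid_property() above is the write-side counterpart of the same
 * bit-range handling: clear GENMASK(hi_bit, lo_bit), OR in the new value
 * shifted into place, then read the field back as a sanity check. A small
 * standalone round trip; field_mask(), set_bits() and get_bits() are
 * illustrative helpers, not selftest APIs.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t field_mask(unsigned int hi, unsigned int lo)
{
	return (uint32_t)(((1ULL << (hi - lo + 1)) - 1) << lo);
}

static uint32_t set_bits(uint32_t reg, unsigned int hi, unsigned int lo, uint32_t val)
{
	return (reg & ~field_mask(hi, lo)) | (val << lo);
}

static uint32_t get_bits(uint32_t reg, unsigned int hi, unsigned int lo)
{
	return (reg & field_mask(hi, lo)) >> lo;
}

int main(void)
{
	uint32_t eax = 0xffffffff;

	eax = set_bits(eax, 11, 8, 0x5);	/* write a 4-bit field at bits 11:8 */
	assert(get_bits(eax, 11, 8) == 0x5);	/* read it back */
	assert(eax == 0xfffff5ff);		/* surrounding bits untouched */
	return 0;
}
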
*/ + TEST_ASSERT_EQ(kvm_cpuid_property(vcpu->cpuid, property), value); +} + +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function); + + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + vcpu_set_cpuid(vcpu); +} + +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set) +{ + struct kvm_cpuid_entry2 *entry; + u32 *reg; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + reg = (&entry->eax) + feature.reg; + + if (set) + *reg |= BIT(feature.bit); + else + *reg &= ~BIT(feature.bit); + + vcpu_set_cpuid(vcpu); +} + +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + + vcpu_msrs_get(vcpu, &buffer.header); return buffer.entry.data; } -/* - * _VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: The result of KVM_SET_MSRS. - * - * Sets the value of an MSR for the given VCPU. - */ -int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); memset(&buffer, 0, sizeof(buffer)); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; buffer.entry.data = msr_value; - r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); - return r; -} - -/* - * VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: On success, nothing. On failure a TEST_ASSERT is produced. - * - * Set value of MSR for VCPU. - */ -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) -{ - int r; - r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); - TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header); } -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; struct kvm_regs regs; TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n" - " num: %u\n", + " num: %u", num); va_start(ap, num); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); if (num >= 1) regs.rdi = va_arg(ap, uint64_t); @@ -873,86 +933,112 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
if (num >= 6) regs.r9 = va_arg(ap, uint64_t); - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); va_end(ap); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { struct kvm_regs regs; struct kvm_sregs sregs; - fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id); fprintf(stream, "%*sregs:\n", indent + 2, ""); - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs_dump(stream, ®s, indent + 4); fprintf(stream, "%*ssregs:\n", indent + 2, ""); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs_dump(stream, &sregs, indent + 4); } -struct kvm_x86_state { - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - struct kvm_regs regs; - struct kvm_xsave xsave; - struct kvm_xcrs xcrs; - struct kvm_sregs sregs; - struct kvm_debugregs debugregs; - union { - struct kvm_nested_state nested; - char nested_[16384]; - }; - struct kvm_msrs msrs; -}; - -static int kvm_get_num_msrs_fd(int kvm_fd) +static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs) { + struct kvm_msr_list *list; struct kvm_msr_list nmsrs; - int r; + int kvm_fd, r; + + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs.nmsrs = 0; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); - TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", - r); + if (!feature_msrs) + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + else + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs); + + TEST_ASSERT(r == -1 && errno == E2BIG, + "Expected -E2BIG, got rc: %i errno: %i (%s)", + r, errno, strerror(errno)); + + list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0])); + TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); + list->nmsrs = nmsrs.nmsrs; + + if (!feature_msrs) + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + else + kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + close(kvm_fd); - return nmsrs.nmsrs; + TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, + "Number of MSRs in list changed, was %d, now %d", + nmsrs.nmsrs, list->nmsrs); + return list; } -static int kvm_get_num_msrs(struct kvm_vm *vm) +const struct kvm_msr_list *kvm_get_msr_index_list(void) { - return kvm_get_num_msrs_fd(vm->kvm_fd); + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(false); + return list; } -struct kvm_msr_list *kvm_get_msr_index_list(void) + +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void) { - struct kvm_msr_list *list; - int nmsrs, r, kvm_fd; + static const struct kvm_msr_list *list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + if (!list) + list = __kvm_get_msr_index_list(true); + return list; +} - nmsrs = kvm_get_num_msrs_fd(kvm_fd); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - close(kvm_fd); +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) +{ + const struct kvm_msr_list *list = kvm_get_msr_index_list(); + int i; - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); + for (i = 0; i < list->nmsrs; ++i) { + if (list->indices[i] == msr_index) + return true; + } - return list; + return false; } -struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu, 
+ struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - struct kvm_msr_list *list; + int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2); + + if (size) { + state->xsave = malloc(size); + vcpu_xsave2_get(vcpu, state->xsave); + } else { + state->xsave = malloc(sizeof(struct kvm_xsave)); + vcpu_xsave_get(vcpu, state->xsave); + } +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) +{ + const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); struct kvm_x86_state *state; - int nmsrs, r, i; + int i; + static int nested_size = -1; if (nested_size == -1) { @@ -968,153 +1054,259 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) * kernel with KVM_RUN. Complete IO prior to migrating state * to a new VM. */ - vcpu_run_complete_io(vm, vcpuid); - - nmsrs = kvm_get_num_msrs(vm); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); - - state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); - r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", - r); - - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", - r); - } + vcpu_run_complete_io(vcpu); - r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", - r); + state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); + TEST_ASSERT(state, "-ENOMEM when allocating kvm state"); + + vcpu_events_get(vcpu, &state->events); + vcpu_mp_state_get(vcpu, &state->mp_state); + vcpu_regs_get(vcpu, &state->regs); + vcpu_save_xsave_state(vcpu, state); + + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_get(vcpu, &state->xcrs); + + vcpu_sregs_get(vcpu, &state->sregs); if (nested_size) { state->nested.size = sizeof(state->nested_); - r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", - r); + + vcpu_nested_state_get(vcpu, &state->nested); TEST_ASSERT(state->nested.size <= nested_size, - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", - state->nested.size, nested_size); - } else + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else { state->nested.size = 0; + } - state->msrs.nmsrs = nmsrs; - for (i = 0; i < nmsrs; i++) - state->msrs.entries[i].index = list->indices[i]; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == nmsrs ? 
-1 : list->indices[r]); + state->msrs.nmsrs = msr_list->nmsrs; + for (i = 0; i < msr_list->nmsrs; i++) + state->msrs.entries[i].index = msr_list->indices[i]; + vcpu_msrs_get(vcpu, &state->msrs); - r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", - r); + vcpu_debugregs_get(vcpu, &state->debugregs); - free(list); return state; } -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int r; + vcpu_sregs_set(vcpu, &state->sregs); + vcpu_msrs_set(vcpu, &state->msrs); - r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", - r); + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_set(vcpu, &state->xcrs); - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", - r); + vcpu_xsave_set(vcpu, state->xsave); + vcpu_events_set(vcpu, &state->events); + vcpu_mp_state_set(vcpu, &state->mp_state); + vcpu_debugregs_set(vcpu, &state->debugregs); + vcpu_regs_set(vcpu, &state->regs); + + if (state->nested.size) + vcpu_nested_state_set(vcpu, &state->nested); +} + +void kvm_x86_state_cleanup(struct kvm_x86_state *state) +{ + free(state->xsave); + free(state); +} + +void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) +{ + if (!kvm_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) { + *pa_bits = kvm_cpu_has(X86_FEATURE_PAE) ? 36 : 32; + *va_bits = 32; + } else { + *pa_bits = kvm_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + *va_bits = kvm_cpu_property(X86_PROPERTY_MAX_VIRT_ADDR); + } +} + +void kvm_init_vm_address_properties(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); + vm->gpa_tag_mask = vm->arch.c_bit; + } else { + vm->arch.sev_fd = -1; + } +} + +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) + return &cpuid->entries[i]; } - r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", - r); + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); - r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); - TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", - r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index); + return NULL; +} + +#define X86_HYPERCALL(inputs...) 
\ +({ \ + uint64_t r; \ + \ + asm volatile("test %[use_vmmcall], %[use_vmmcall]\n\t" \ + "jnz 1f\n\t" \ + "vmcall\n\t" \ + "jmp 2f\n\t" \ + "1: vmmcall\n\t" \ + "2:" \ + : "=a"(r) \ + : [use_vmmcall] "r" (host_cpu_is_amd), inputs); \ + \ + r; \ +}) + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + return X86_HYPERCALL("a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)); +} + +uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1) +{ + return X86_HYPERCALL("a"(nr), "D"(a0), "S"(a1)); +} + +void xen_hypercall(uint64_t nr, uint64_t a0, void *a1) +{ + GUEST_ASSERT(!__xen_hypercall(nr, a0, a1)); +} + +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int kvm_fd; + + if (cpuid) + return cpuid; - r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", - r); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); - r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", - r); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", - r); + close(kvm_fd); + return cpuid; +} - r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", - r); +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) +{ + static struct kvm_cpuid2 *cpuid_full; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); + if (!cpuid_full) { + perror("malloc"); + abort(); + } - if (state->nested.size) { - r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", - r); + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; } + + vcpu_init_cpuid(vcpu, cpuid_full); } -bool is_intel_cpu(void) +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) { - int eax, ebx, ecx, edx; - const uint32_t *chunk; - const int leaf = 0; + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); - __asm__ __volatile__( - "cpuid" - : /* output */ "=a"(eax), "=b"(ebx), - "=c"(ecx), "=d"(edx) - : /* input */ "0"(leaf), "2"(0)); + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); - chunk = (const uint32_t *)("GenuineIntel"); - return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); + return cpuid; } -uint32_t kvm_get_cpuid_max_basic(void) +unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { - return kvm_get_supported_cpuid_entry(0)->eax; + const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ + unsigned long ht_gfn, max_gfn, max_pfn; + uint8_t maxphyaddr; + + max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; + + /* Avoid reserved HyperTransport region on AMD processors. 
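
/*
 * A worked numeric check of the HyperTransport-hole arithmetic used just
 * below in vm_compute_max_gfn(): num_ht_pages covers 12 GiB of 4 KiB frames,
 * 2^28 such frames equal exactly 1 TiB, so the pre-family-17h hole starts at
 * 1012 GiB. Standalone sketch assuming 4 KiB pages.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	const unsigned int page_shift = 12;				/* 4 KiB pages */
	const uint64_t num_ht_pages = 12ULL << (30 - page_shift);	/* 12 GiB of frames */
	const uint64_t one_tib_gfns = 1ULL << 28;			/* (1 TiB) >> 12 */
	uint64_t ht_gfn = one_tib_gfns - num_ht_pages;

	/* The reserved range spans [1 TiB - 12 GiB, 1 TiB). */
	assert(ht_gfn == ((1012ULL << 30) >> page_shift));
	return 0;
}
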
*/ + if (!host_cpu_is_amd) + return max_gfn; + + /* On parts with <40 physical address bits, the area is fully hidden */ + if (vm->pa_bits < 40) + return max_gfn; + + /* Before family 17h, the HyperTransport area is just below 1T. */ + ht_gfn = (1 << 28) - num_ht_pages; + if (this_cpu_family() < 0x17) + goto done; + + /* + * Otherwise it's at the top of the physical address space, possibly + * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use + * the old conservative value if MAXPHYADDR is not enumerated. + */ + if (!this_cpu_has_p(X86_PROPERTY_MAX_PHY_ADDR)) + goto done; + + maxphyaddr = this_cpu_property(X86_PROPERTY_MAX_PHY_ADDR); + max_pfn = (1ULL << (maxphyaddr - vm->page_shift)) - 1; + + if (this_cpu_has_p(X86_PROPERTY_PHYS_ADDR_REDUCTION)) + max_pfn >>= this_cpu_property(X86_PROPERTY_PHYS_ADDR_REDUCTION); + + ht_gfn = max_pfn - num_ht_pages; +done: + return min(max_gfn, ht_gfn - 1); } -uint32_t kvm_get_cpuid_max_extended(void) +/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ +bool vm_is_unrestricted_guest(struct kvm_vm *vm) { - return kvm_get_supported_cpuid_entry(0x80000000)->eax; + /* Ensure that a KVM vendor-specific module is loaded. */ + if (vm == NULL) + close(open_kvm_dev_path_or_exit()); + + return get_kvm_intel_param_bool("unrestricted_guest"); } -void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) +void kvm_selftest_arch_init(void) { - struct kvm_cpuid_entry2 *entry; - bool pae; + host_cpu_is_intel = this_cpu_is_intel(); + host_cpu_is_amd = this_cpu_is_amd(); + is_forced_emulation_enabled = kvm_is_forced_emulation_enabled(); +} - /* SDM 4.1.4 */ - if (kvm_get_cpuid_max_extended() < 0x80000008) { - pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6); - *pa_bits = pae ? 36 : 32; - *va_bits = 32; - } else { - entry = kvm_get_supported_cpuid_entry(0x80000008); - *pa_bits = entry->eax & 0xff; - *va_bits = (entry->eax >> 8) & 0xff; - } +bool sys_clocksource_is_based_on_tsc(void) +{ + char *clk_name = sys_get_cur_clocksource(); + bool ret = !strcmp(clk_name, "tsc\n") || + !strcmp(clk_name, "hyperv_clocksource_tsc_page\n"); + + free(clk_name); + + return ret; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/sev.c b/tools/testing/selftests/kvm/lib/x86_64/sev.c new file mode 100644 index 000000000000..e9535ee20b7f --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/sev.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdint.h> +#include <stdbool.h> + +#include "sev.h" + +/* + * sparsebit_next_clear() can return 0 if [x, 2**64-1] are all set, and the + * -1 would then cause an underflow back to 2**64 - 1. This is expected and + * correct. + * + * If the last range in the sparsebit is [x, y] and we try to iterate, + * sparsebit_next_set() will return 0, and sparsebit_next_clear() will try + * and find the first range, but that's correct because the condition + * expression would cause us to quit the loop. 
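
/*
 * The loop in encrypt_region() below walks each maximal run [i, j] of set
 * bits in the region's protected-page bitmap and turns it into a byte size
 * and an offset from the start of the region. A rough standalone analogy
 * using a plain 64-bit bitmap; find_next_run() is an illustrative helper and
 * not the sparsebit API, which handles arbitrarily large, sparse sets.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool find_next_run(uint64_t map, unsigned int *i, unsigned int *j)
{
	unsigned int k = *i;

	while (k < 64 && !(map & (1ULL << k)))
		k++;			/* skip clear bits */
	if (k == 64)
		return false;		/* no more set bits */
	*i = k;
	while (k + 1 < 64 && (map & (1ULL << (k + 1))))
		k++;			/* extend the run while bits stay set */
	*j = k;
	return true;
}

int main(void)
{
	const uint64_t page_size = 4096;
	uint64_t map = 0xf0f0;		/* pages 4-7 and 12-15 are "protected" */
	unsigned int i = 0, j, runs = 0;
	uint64_t total = 0;

	while (find_next_run(map, &i, &j)) {
		total += (uint64_t)(j - i + 1) * page_size;	/* bytes in this run */
		runs++;
		i = j + 1;		/* resume the search after the run */
	}
	assert(runs == 2 && total == 8 * page_size);
	return 0;
}
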
+ */ +static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region) +{ + const struct sparsebit *protected_phy_pages = region->protected_phy_pages; + const vm_paddr_t gpa_base = region->region.guest_phys_addr; + const sparsebit_idx_t lowest_page_in_region = gpa_base >> vm->page_shift; + sparsebit_idx_t i, j; + + if (!sparsebit_any_set(protected_phy_pages)) + return; + + sev_register_encrypted_memory(vm, region); + + sparsebit_for_each_set_range(protected_phy_pages, i, j) { + const uint64_t size = (j - i + 1) * vm->page_size; + const uint64_t offset = (i - lowest_page_in_region) * vm->page_size; + + sev_launch_update_data(vm, gpa_base + offset, size); + } +} + +void sev_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_es_vm_init(struct kvm_vm *vm) +{ + if (vm->type == KVM_X86_DEFAULT_VM) { + assert(vm->arch.sev_fd == -1); + vm->arch.sev_fd = open_sev_dev_path_or_exit(); + vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); + } else { + struct kvm_sev_init init = { 0 }; + assert(vm->type == KVM_X86_SEV_ES_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); + } +} + +void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) +{ + struct kvm_sev_launch_start launch_start = { + .policy = policy, + }; + struct userspace_mem_region *region; + struct kvm_sev_guest_status status; + int ctr; + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_START, &launch_start); + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + + TEST_ASSERT_EQ(status.policy, policy); + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE); + + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) + encrypt_region(vm, region); + + if (policy & SEV_POLICY_ES) + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + + vm->arch.is_pt_protected = true; +} + +void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement) +{ + struct kvm_sev_launch_measure launch_measure; + struct kvm_sev_guest_status guest_status; + + launch_measure.len = 256; + launch_measure.uaddr = (__u64)measurement; + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_MEASURE, &launch_measure); + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &guest_status); + TEST_ASSERT_EQ(guest_status.state, SEV_GUEST_STATE_LAUNCH_SECRET); +} + +void sev_vm_launch_finish(struct kvm_vm *vm) +{ + struct kvm_sev_guest_status status; + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + TEST_ASSERT(status.state == SEV_GUEST_STATE_LAUNCH_UPDATE || + status.state == SEV_GUEST_STATE_LAUNCH_SECRET, + "Unexpected guest state: %d", status.state); + + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_FINISH, NULL); + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); + TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); +} + +struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, + struct kvm_vcpu **cpu) +{ + struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = type, + }; + struct kvm_vm *vm; + struct kvm_vcpu *cpus[1]; + + vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus); + *cpu = cpus[0]; + + return vm; +} + +void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +{ + sev_vm_launch(vm, policy); + + if (!measurement) + measurement = alloca(256); + + sev_vm_launch_measure(vm, measurement); + + sev_vm_launch_finish(vm); +} diff --git 
a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 3a5c72ed2b79..5495a92dfd5a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -9,10 +9,11 @@ #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "svm_util.h" +#define SEV_DEV_PATH "/dev/sev" + struct gpr64_regs guest_regs; u64 rflags; @@ -30,20 +31,22 @@ u64 rflags; struct svm_test_data * vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) { - vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); - svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->vmcb = (void *)vm_vaddr_alloc_page(vm); svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); - svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->save_area = (void *)vm_vaddr_alloc_page(vm); svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + svm->msr = (void *)vm_vaddr_alloc_page(vm); + svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); + svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); + memset(svm->msr_hva, 0, getpagesize()); + *p_svm_gva = svm_gva; return svm; } @@ -74,7 +77,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); memset(vmcb, 0, sizeof(*vmcb)); - asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory"); + asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); @@ -95,6 +98,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL); + ctrl->msrpm_base_pa = svm->msr_gpa; vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; @@ -131,35 +135,30 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) { asm volatile ( - "vmload\n\t" + "vmload %[vmcb_gpa]\n\t" "mov rflags, %%r15\n\t" // rflags "mov %%r15, 0x170(%[vmcb])\n\t" "mov guest_regs, %%r15\n\t" // rax "mov %%r15, 0x1f8(%[vmcb])\n\t" LOAD_GPR_C - "vmrun\n\t" + "vmrun %[vmcb_gpa]\n\t" SAVE_GPR_C "mov 0x170(%[vmcb]), %%r15\n\t" // rflags "mov %%r15, rflags\n\t" "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax "mov %%r15, guest_regs\n\t" - "vmsave\n\t" + "vmsave %[vmcb_gpa]\n\t" : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) : "r15", "memory"); } -bool nested_svm_supported(void) -{ - struct kvm_cpuid_entry2 *entry = - kvm_get_supported_cpuid_entry(0x80000001); - - return entry->ecx & CPUID_SVM; -} - -void nested_svm_check_supported(void) +/* + * Open SEV_DEV_PATH if available, otherwise exit the entire program. + * + * Return: + * The opened file descriptor of /dev/sev. 
+ */ +int open_sev_dev_path_or_exit(void) { - if (!nested_svm_supported()) { - print_skip("nested SVM not enabled"); - exit(KSFT_SKIP); - } + return open_path_or_exit(SEV_DEV_PATH, 0); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index da4d89ad5419..1265cecc7dd1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -8,49 +8,49 @@ #define UCALL_PIO_PORT ((uint16_t)0x1000) -void ucall_init(struct kvm_vm *vm, void *arg) +void ucall_arch_do_ucall(vm_vaddr_t uc) { + /* + * FIXME: Revert this hack (the entire commit that added it) once nVMX + * preserves L2 GPRs across a nested VM-Exit. If a ucall from L2, e.g. + * to do a GUEST_SYNC(), lands the vCPU in L1, any and all GPRs can be + * clobbered by L1. Save and restore non-volatile GPRs (clobbering RBP + * in particular is problematic) along with RDX and RDI (which are + * inputs), and clobber volatile GPRs. *sigh* + */ +#define HORRIFIC_L2_UCALL_CLOBBER_HACK \ + "rcx", "rsi", "r8", "r9", "r10", "r11" + + asm volatile("push %%rbp\n\t" + "push %%r15\n\t" + "push %%r14\n\t" + "push %%r13\n\t" + "push %%r12\n\t" + "push %%rbx\n\t" + "push %%rdx\n\t" + "push %%rdi\n\t" + "in %[port], %%al\n\t" + "pop %%rdi\n\t" + "pop %%rdx\n\t" + "pop %%rbx\n\t" + "pop %%r12\n\t" + "pop %%r13\n\t" + "pop %%r14\n\t" + "pop %%r15\n\t" + "pop %%rbp\n\t" + : : [port] "d" (UCALL_PIO_PORT), "D" (uc) : "rax", "memory", + HORRIFIC_L2_UCALL_CLOBBER_HACK); } -void ucall_uninit(struct kvm_vm *vm) +void *ucall_arch_get_ucall(struct kvm_vcpu *vcpu) { -} - -void ucall(uint64_t cmd, int nargs, ...) -{ - struct ucall uc = { - .cmd = cmd, - }; - va_list va; - int i; - - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; - - va_start(va, nargs); - for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); - va_end(va); - - asm volatile("in %[port], %%al" - : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); -} - -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) -{ - struct kvm_run *run = vcpu_state(vm, vcpu_id); - struct ucall ucall = {}; + struct kvm_run *run = vcpu->run; if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; - vcpu_regs_get(vm, vcpu_id, ®s); - memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), - sizeof(ucall)); - - vcpu_run_complete_io(vm, vcpu_id); - if (uc) - memcpy(uc, &ucall, sizeof(ucall)); + vcpu_regs_get(vcpu, ®s); + return (void *)regs.rdi; } - - return ucall.cmd; + return NULL; } diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index f1e00d43eea2..089b8925b6b2 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -5,9 +5,10 @@ * Copyright (C) 2018, Google LLC. 
*/ +#include <asm/msr-index.h> + #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "vmx.h" @@ -43,21 +44,17 @@ struct eptPageTablePointer { uint64_t address:40; uint64_t reserved_63_52:12; }; -int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; - - vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + (unsigned long)&evmcs_ver); /* KVM should return supported EVMCS version range */ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && (evmcs_ver & 0xff) > 0, - "Incorrect EVMCS version range: %x:%x\n", + "Incorrect EVMCS version range: %x:%x", evmcs_ver & 0xff, evmcs_ver >> 8); return evmcs_ver; @@ -77,55 +74,41 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) struct vmx_pages * vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) { - vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); /* Setup of a region of guest memory for the vmxon region. */ - vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); /* Setup of a region of guest memory for a vmcs. */ - vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); /* Setup of a region of guest memory for the MSR bitmap. */ - vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr = (void *)vm_vaddr_alloc_page(vm); vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); memset(vmx->msr_hva, 0, getpagesize()); /* Setup of a region of guest memory for the shadow VMCS. */ - vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ - vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread = (void *)vm_vaddr_alloc_page(vm); vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); memset(vmx->vmread_hva, 0, getpagesize()); - vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); - /* Setup of a region of guest memory for the VP Assist page. */ - vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); - vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); - vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); - - /* Setup of a region of guest memory for the enlightened VMCS. 
*/ - vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); - vmx->enlightened_vmcs_hva = - addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); - vmx->enlightened_vmcs_gpa = - addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs); - *p_vmx_gva = vmx_gva; return vmx; } @@ -176,30 +159,32 @@ bool prepare_for_vmx_operation(struct vmx_pages *vmx) bool load_vmcs(struct vmx_pages *vmx) { - if (!enable_evmcs) { - /* Load a VMCS. */ - *(uint32_t *)(vmx->vmcs) = vmcs_revision(); - if (vmclear(vmx->vmcs_gpa)) - return false; - - if (vmptrld(vmx->vmcs_gpa)) - return false; - - /* Setup shadow VMCS, do not load it yet. */ - *(uint32_t *)(vmx->shadow_vmcs) = - vmcs_revision() | 0x80000000ul; - if (vmclear(vmx->shadow_vmcs_gpa)) - return false; - } else { - if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, - vmx->enlightened_vmcs)) - return false; - current_evmcs->revision_id = EVMCS_VERSION; - } + /* Load a VMCS. */ + *(uint32_t *)(vmx->vmcs) = vmcs_revision(); + if (vmclear(vmx->vmcs_gpa)) + return false; + + if (vmptrld(vmx->vmcs_gpa)) + return false; + + /* Setup shadow VMCS, do not load it yet. */ + *(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul; + if (vmclear(vmx->shadow_vmcs_gpa)) + return false; return true; } +static bool ept_vpid_cap_supported(uint64_t mask) +{ + return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask; +} + +bool ept_1g_pages_supported(void) +{ + return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES); +} + /* * Initialize the control fields to the most basic settings possible. */ @@ -217,7 +202,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) struct eptPageTablePointer eptp = { .memory_type = VMX_BASIC_MEM_TYPE_WB, .page_walk_length = 3, /* + 1 */ - .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, }; @@ -379,101 +364,93 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -bool nested_vmx_supported(void) +static void nested_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) { - struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); - - return entry->ecx & CPUID_VMX; -} - -void nested_vmx_check_supported(void) -{ - if (!nested_vmx_supported()) { - print_skip("nested VMX not enabled"); - exit(KSFT_SKIP); + if (!pte->readable) { + pte->writable = true; + pte->readable = true; + pte->executable = true; + pte->page_size = (current_level == target_level); + if (pte->page_size) + pte->address = paddr >> vm->page_shift; + else + pte->address = vm_alloc_page_table(vm) >> vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. 
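
/*
 * The eptPageTablePointer initializer above (write-back memory type,
 * page_walk_length = 3 for a four-level walk, optional A/D enable, PML4 PFN)
 * maps onto the raw EPTP layout: bits 2:0 memory type (6 = WB), bits 5:3
 * page-walk length minus one, bit 6 enable accessed/dirty flags, bits 51:12
 * the PML4 physical address. A hedged standalone sketch of packing that
 * value; the constants are written out here rather than taken from the
 * selftest or kernel headers.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t make_eptp(uint64_t pml4_gpa, int enable_ad)
{
	const uint64_t memtype_wb = 6;		/* bits 2:0 */
	const uint64_t walk_len_m1 = 3;		/* bits 5:3: 4-level walk */

	return memtype_wb | (walk_len_m1 << 3) |
	       ((uint64_t)!!enable_ad << 6) |
	       (pml4_gpa & ~0xfffULL);		/* page-aligned PML4 address */
}

int main(void)
{
	uint64_t eptp = make_eptp(0x123000, 1);

	assert((eptp & 0x7) == 6);		/* memory type: write-back */
	assert(((eptp >> 3) & 0x7) == 3);	/* page-walk length - 1 */
	assert(eptp & (1ULL << 6));		/* A/D flags enabled */
	assert((eptp >> 12) == 0x123);		/* PML4 page frame number */
	return 0;
}
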
+ */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, nested_paddr: 0x%lx", + current_level, nested_paddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, nested_paddr: 0x%lx", + current_level, nested_paddr); } } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) + +void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { - uint16_t index[4]; - struct eptPageTableEntry *pml4e; + const uint64_t page_size = PG_LEVEL_SIZE(target_level); + struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - TEST_ASSERT((nested_paddr % vm->page_size) == 0, + TEST_ASSERT((nested_paddr >> 48) == 0, + "Nested physical address 0x%lx requires 5-level paging", + nested_paddr); + TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx vm->page_size: 0x%x", - nested_paddr, vm->page_size); + " nested_paddr: 0x%lx page_size: 0x%lx", + nested_paddr, page_size); TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % vm->page_size) == 0, + TEST_ASSERT((paddr % page_size) == 0, "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); + " paddr: 0x%lx page_size: 0x%lx", + paddr, page_size); TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - index[0] = (nested_paddr >> 12) & 0x1ffu; - index[1] = (nested_paddr >> 21) & 0x1ffu; - index[2] = (nested_paddr >> 30) & 0x1ffu; - index[3] = (nested_paddr >> 39) & 0x1ffu; - - /* Allocate page directory pointer table if not present. */ - pml4e = vmx->eptp_hva; - if (!pml4e[index[3]].readable) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].readable = true; - pml4e[index[3]].executable = true; - } + for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { + index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + pte = &pt[index]; - /* Allocate page directory table if not present. */ - struct eptPageTableEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].readable) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].readable = true; - pdpe[index[2]].executable = true; - } + nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - /* Allocate page table if not present. */ - struct eptPageTableEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].readable) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].readable = true; - pde[index[1]].executable = true; - } + if (pte->page_size) + break; - /* Fill in page table entry. 
*/ - struct eptPageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].readable = true; - pte[index[0]].executable = true; + pt = addr_gpa2hva(vm, pte->address * vm->page_size); + } /* * For now mark these as accessed and dirty because the only * testcase we have needs that. Can be reconsidered later. */ - pte[index[0]].accessed = true; - pte[index[0]].dirty = true; + pte->accessed = true; + pte->dirty = true; + +} + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) +{ + __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -484,7 +461,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * nested_paddr - Nested guest physical address to map * paddr - VM Physical Address * size - The size of the range to map - * eptp_memslot - Memory region slot for new virtual translation tables + * level - The level at which to map the range * * Output Args: None * @@ -493,28 +470,34 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. */ -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot) +void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + int level) { - size_t page_size = vm->page_size; + size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + __nested_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) +{ + __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + /* Prepare an identity extended page table that maps all the * physical pages in VM. */ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot) + uint32_t memslot) { sparsebit_idx_t i, last; struct userspace_mem_region *region = @@ -530,15 +513,42 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, nested_map(vmx, vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, - 1 << vm->page_shift, - eptp_memslot); + 1 << vm->page_shift); } } +/* Identity map a region with 1GiB Pages. 
*/ +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size) +{ + __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); +} + +bool kvm_cpu_has_ept(void) +{ + uint64_t ctrl; + + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; + if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + return false; + + ctrl = kvm_get_feature_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; + return ctrl & SECONDARY_EXEC_ENABLE_EPT; +} + void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); + + vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); } + +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); + vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); + vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); +} diff --git a/tools/testing/selftests/kvm/max_guest_memory_test.c b/tools/testing/selftests/kvm/max_guest_memory_test.c new file mode 100644 index 000000000000..0b9678858b6d --- /dev/null +++ b/tools/testing/selftests/kvm/max_guest_memory_test.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <semaphore.h> +#include <sys/types.h> +#include <signal.h> +#include <errno.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/atomic.h> +#include <linux/sizes.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "guest_modes.h" +#include "processor.h" + +static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride) +{ + uint64_t gpa; + + for (;;) { + for (gpa = start_gpa; gpa < end_gpa; gpa += stride) + *((volatile uint64_t *)gpa) = gpa; + GUEST_SYNC(0); + } +} + +struct vcpu_info { + struct kvm_vcpu *vcpu; + uint64_t start_gpa; + uint64_t end_gpa; +}; + +static int nr_vcpus; +static atomic_t rendezvous; + +static void rendezvous_with_boss(void) +{ + int orig = atomic_read(&rendezvous); + + if (orig > 0) { + atomic_dec_and_test(&rendezvous); + while (atomic_read(&rendezvous) > 0) + cpu_relax(); + } else { + atomic_inc(&rendezvous); + while (atomic_read(&rendezvous) < 0) + cpu_relax(); + } +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); +} + +static void *vcpu_worker(void *data) +{ + struct vcpu_info *info = data; + struct kvm_vcpu *vcpu = info->vcpu; + struct kvm_vm *vm = vcpu->vm; + struct kvm_sregs sregs; + + vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size); + + rendezvous_with_boss(); + + run_vcpu(vcpu); + rendezvous_with_boss(); + vcpu_sregs_get(vcpu, &sregs); +#ifdef __x86_64__ + /* Toggle CR0.WP to trigger a MMU context reset. 
*/ + sregs.cr0 ^= X86_CR0_WP; +#endif + vcpu_sregs_set(vcpu, &sregs); + rendezvous_with_boss(); + + run_vcpu(vcpu); + rendezvous_with_boss(); + + return NULL; +} + +static pthread_t *spawn_workers(struct kvm_vm *vm, struct kvm_vcpu **vcpus, + uint64_t start_gpa, uint64_t end_gpa) +{ + struct vcpu_info *info; + uint64_t gpa, nr_bytes; + pthread_t *threads; + int i; + + threads = malloc(nr_vcpus * sizeof(*threads)); + TEST_ASSERT(threads, "Failed to allocate vCPU threads"); + + info = malloc(nr_vcpus * sizeof(*info)); + TEST_ASSERT(info, "Failed to allocate vCPU gpa ranges"); + + nr_bytes = ((end_gpa - start_gpa) / nr_vcpus) & + ~((uint64_t)vm->page_size - 1); + TEST_ASSERT(nr_bytes, "C'mon, no way you have %d CPUs", nr_vcpus); + + for (i = 0, gpa = start_gpa; i < nr_vcpus; i++, gpa += nr_bytes) { + info[i].vcpu = vcpus[i]; + info[i].start_gpa = gpa; + info[i].end_gpa = gpa + nr_bytes; + pthread_create(&threads[i], NULL, vcpu_worker, &info[i]); + } + return threads; +} + +static void rendezvous_with_vcpus(struct timespec *time, const char *name) +{ + int i, rendezvoused; + + pr_info("Waiting for vCPUs to finish %s...\n", name); + + rendezvoused = atomic_read(&rendezvous); + for (i = 0; abs(rendezvoused) != 1; i++) { + usleep(100); + if (!(i & 0x3f)) + pr_info("\r%d vCPUs haven't rendezvoused...", + abs(rendezvoused) - 1); + rendezvoused = atomic_read(&rendezvous); + } + + clock_gettime(CLOCK_MONOTONIC, time); + + /* Release the vCPUs after getting the time of the previous action. */ + pr_info("\rAll vCPUs finished %s, releasing...\n", name); + if (rendezvoused > 0) + atomic_set(&rendezvous, -nr_vcpus - 1); + else + atomic_set(&rendezvous, nr_vcpus + 1); +} + +static void calc_default_nr_vcpus(void) +{ + cpu_set_t possible_mask; + int r; + + r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); + TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", + errno, strerror(errno)); + + nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; + TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); +} + +int main(int argc, char *argv[]) +{ + /* + * Skip the first 4gb and slot0. slot0 maps <1gb and is used to back + * the guest's code, stack, and page tables. Because selftests creates + * an IRQCHIP, a.k.a. a local APIC, KVM creates an internal memslot + * just below the 4gb boundary. This test could create memory at + * 1gb-3gb,but it's simpler to skip straight to 4gb. + */ + const uint64_t start_gpa = SZ_4G; + const int first_slot = 1; + + struct timespec time_start, time_run1, time_reset, time_run2; + uint64_t max_gpa, gpa, slot_size, max_mem, i; + int max_slots, slot, opt, fd; + bool hugepages = false; + struct kvm_vcpu **vcpus; + pthread_t *threads; + struct kvm_vm *vm; + void *mem; + + /* + * Default to 2gb so that maxing out systems with MAXPHADDR=46, which + * are quite common for x86, requires changing only max_mem (KVM allows + * 32k memslots, 32k * 2gb == ~64tb of guest memory). + */ + slot_size = SZ_2G; + + max_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_slots > first_slot, "KVM is broken"); + + /* All KVM MMUs should be able to survive a 128gb guest. 
*/ + max_mem = 128ull * SZ_1G; + + calc_default_nr_vcpus(); + + while ((opt = getopt(argc, argv, "c:h:m:s:H")) != -1) { + switch (opt) { + case 'c': + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + break; + case 'm': + max_mem = 1ull * atoi_positive("Memory size", optarg) * SZ_1G; + break; + case 's': + slot_size = 1ull * atoi_positive("Slot size", optarg) * SZ_1G; + break; + case 'H': + hugepages = true; + break; + case 'h': + default: + printf("usage: %s [-c nr_vcpus] [-m max_mem_in_gb] [-s slot_size_in_gb] [-H]\n", argv[0]); + exit(1); + } + } + + vcpus = malloc(nr_vcpus * sizeof(*vcpus)); + TEST_ASSERT(vcpus, "Failed to allocate vCPU array"); + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + + max_gpa = vm->max_gfn << vm->page_shift; + TEST_ASSERT(max_gpa > (4 * slot_size), "MAXPHYADDR <4gb "); + + fd = kvm_memfd_alloc(slot_size, hugepages); + mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem != MAP_FAILED, "mmap() failed"); + + TEST_ASSERT(!madvise(mem, slot_size, MADV_NOHUGEPAGE), "madvise() failed"); + + /* Pre-fault the memory to avoid taking mmap_sem on guest page faults. */ + for (i = 0; i < slot_size; i += vm->page_size) + ((uint8_t *)mem)[i] = 0xaa; + + gpa = 0; + for (slot = first_slot; slot < max_slots; slot++) { + gpa = start_gpa + ((slot - first_slot) * slot_size); + if (gpa + slot_size > max_gpa) + break; + + if ((gpa - start_gpa) >= max_mem) + break; + + vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem); + +#ifdef __x86_64__ + /* Identity map memory in the guest using 1gb pages. */ + for (i = 0; i < slot_size; i += SZ_1G) + __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); +#else + for (i = 0; i < slot_size; i += vm->page_size) + virt_pg_map(vm, gpa + i, gpa + i); +#endif + } + + atomic_set(&rendezvous, nr_vcpus + 1); + threads = spawn_workers(vm, vcpus, start_gpa, gpa); + + free(vcpus); + vcpus = NULL; + + pr_info("Running with %lugb of guest memory and %u vCPUs\n", + (gpa - start_gpa) / SZ_1G, nr_vcpus); + + rendezvous_with_vcpus(&time_start, "spawning"); + rendezvous_with_vcpus(&time_run1, "run 1"); + rendezvous_with_vcpus(&time_reset, "reset"); + rendezvous_with_vcpus(&time_run2, "run 2"); + + time_run2 = timespec_sub(time_run2, time_reset); + time_reset = timespec_sub(time_reset, time_run1); + time_run1 = timespec_sub(time_run1, time_start); + + pr_info("run1 = %ld.%.9lds, reset = %ld.%.9lds, run2 = %ld.%.9lds\n", + time_run1.tv_sec, time_run1.tv_nsec, + time_reset.tv_sec, time_reset.tv_nsec, + time_run2.tv_sec, time_run2.tv_nsec); + + /* + * Delete even numbered slots (arbitrary) and unmap the first half of + * the backing (also arbitrary) to verify KVM correctly drops all + * references to the removed regions. + */ + for (slot = (slot - 1) & ~1ull; slot >= first_slot; slot -= 2) + vm_set_user_memory_region(vm, slot, 0, 0, 0, NULL); + + munmap(mem, slot_size / 2); + + /* Sanity check that the vCPUs actually ran. */ + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + /* + * Deliberately exit without deleting the remaining memslots or closing + * kvm_fd to test cleanup via mmu_notifier.release. 
+ */ +} diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c new file mode 100644 index 000000000000..05fcf902e067 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM memslot modification stress test + * Adapted from demand_paging_test.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2020, Google, Inc. + */ +#include <stdio.h> +#include <stdlib.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <asm/unistd.h> +#include <time.h> +#include <poll.h> +#include <pthread.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/userfaultfd.h> + +#include "memstress.h" +#include "processor.h" +#include "test_util.h" +#include "guest_modes.h" + +#define DUMMY_MEMSLOT_INDEX 7 + +#define DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS 10 + + +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + struct kvm_run *run; + int ret; + + run = vcpu->run; + + /* Let the guest access its memory until a stop signal is received */ + while (!READ_ONCE(memstress_args.stop_vcpus)) { + ret = _vcpu_run(vcpu); + TEST_ASSERT(ret == 0, "vcpu_run failed: %d", ret); + + if (get_ucall(vcpu, NULL) == UCALL_SYNC) + continue; + + TEST_ASSERT(false, + "Invalid guest sync status: exit_reason=%s\n", + exit_reason_str(run->exit_reason)); + } +} + +struct memslot_antagonist_args { + struct kvm_vm *vm; + useconds_t delay; + uint64_t nr_modifications; +}; + +static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, + uint64_t nr_modifications) +{ + uint64_t pages = max_t(int, vm->page_size, getpagesize()) / vm->page_size; + uint64_t gpa; + int i; + + /* + * Add the dummy memslot just below the memstress memslot, which is + * at the top of the guest physical address space. + */ + gpa = memstress_args.gpa - pages * vm->page_size; + + for (i = 0; i < nr_modifications; i++) { + usleep(delay); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, + DUMMY_MEMSLOT_INDEX, pages, 0); + + vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); + } +} + +struct test_params { + useconds_t delay; + uint64_t nr_iterations; + bool partition_vcpu_memory_access; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) +{ + struct test_params *p = arg; + struct kvm_vm *vm; + + vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, + VM_MEM_SRC_ANONYMOUS, + p->partition_vcpu_memory_access); + + pr_info("Finished creating vCPUs\n"); + + memstress_start_vcpu_threads(nr_vcpus, vcpu_worker); + + pr_info("Started all vCPUs\n"); + + add_remove_memslot(vm, p->delay, p->nr_iterations); + + memstress_join_vcpu_threads(nr_vcpus); + pr_info("All vCPU threads joined\n"); + + memstress_destroy_vm(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-m mode] [-d delay_usec]\n" + " [-b memory] [-v vcpus] [-o] [-i iterations]\n", name); + guest_modes_help(); + printf(" -d: add a delay between each iteration of adding and\n" + " deleting a memslot in usec.\n"); + printf(" -b: specify the size of the memory region which should be\n" + " accessed by each vCPU. e.g. 
10M or 3G.\n" + " Default: 1G\n"); + printf(" -v: specify the number of vCPUs to run.\n"); + printf(" -o: Overlap guest memory accesses instead of partitioning\n" + " them into a separate region of memory for each vCPU.\n"); + printf(" -i: specify the number of iterations of adding and removing\n" + " a memslot.\n" + " Default: %d\n", DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + int opt; + struct test_params p = { + .delay = 0, + .nr_iterations = DEFAULT_MEMSLOT_MODIFICATION_ITERATIONS, + .partition_vcpu_memory_access = true + }; + + guest_modes_append_default(); + + while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) { + switch (opt) { + case 'm': + guest_modes_cmdline(optarg); + break; + case 'd': + p.delay = atoi_non_negative("Delay", optarg); + break; + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'v': + nr_vcpus = atoi_positive("Number of vCPUs", optarg); + TEST_ASSERT(nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", + max_vcpus); + break; + case 'o': + p.partition_vcpu_memory_access = false; + break; + case 'i': + p.nr_iterations = atoi_positive("Number of iterations", optarg); + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + for_each_guest_mode(run_test, &p); + + return 0; +} diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c new file mode 100644 index 000000000000..579a64f97333 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -0,0 +1,1129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A memslot-related performance benchmark. + * + * Copyright (C) 2021 Oracle and/or its affiliates. + * + * Basic guest setup / host vCPU thread code lifted from set_memory_region_test. + */ +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <unistd.h> + +#include <linux/compiler.h> +#include <linux/sizes.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define MEM_EXTRA_SIZE SZ_64K + +#define MEM_SIZE (SZ_512M + MEM_EXTRA_SIZE) +#define MEM_GPA SZ_256M +#define MEM_AUX_GPA MEM_GPA +#define MEM_SYNC_GPA MEM_AUX_GPA +#define MEM_TEST_GPA (MEM_AUX_GPA + MEM_EXTRA_SIZE) +#define MEM_TEST_SIZE (MEM_SIZE - MEM_EXTRA_SIZE) + +/* + * 32 MiB is max size that gets well over 100 iterations on 509 slots. + * Considering that each slot needs to have at least one page up to + * 8194 slots in use can then be tested (although with slightly + * limited resolution). + */ +#define MEM_SIZE_MAP (SZ_32M + MEM_EXTRA_SIZE) +#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - MEM_EXTRA_SIZE) + +/* + * 128 MiB is min size that fills 32k slots with at least one page in each + * while at the same time gets 100+ iterations in such test + * + * 2 MiB chunk size like a typical huge page + */ +#define MEM_TEST_UNMAP_SIZE SZ_128M +#define MEM_TEST_UNMAP_CHUNK_SIZE SZ_2M + +/* + * For the move active test the middle of the test area is placed on + * a memslot boundary: half lies in the memslot being moved, half in + * other memslot(s). + * + * We have different number of memory slots, excluding the reserved + * memory slot 0, on various architectures and configurations. 
The + * memory size in this test is calculated by picking the maximal + * last memory slot's memory size, with alignment to the largest + * supported page size (64KB). In this way, the selected memory + * size for this test is compatible with test_memslot_move_prepare(). + * + * architecture slots memory-per-slot memory-on-last-slot + * -------------------------------------------------------------- + * x86-4KB 32763 16KB 160KB + * arm64-4KB 32766 16KB 112KB + * arm64-16KB 32766 16KB 112KB + * arm64-64KB 8192 64KB 128KB + */ +#define MEM_TEST_MOVE_SIZE (3 * SZ_64K) +#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) +static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, + "invalid move test region size"); + +#define MEM_TEST_VAL_1 0x1122334455667788 +#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00 + +struct vm_data { + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + pthread_t vcpu_thread; + uint32_t nslots; + uint64_t npages; + uint64_t pages_per_slot; + void **hva_slots; + bool mmio_ok; + uint64_t mmio_gpa_min; + uint64_t mmio_gpa_max; +}; + +struct sync_area { + uint32_t guest_page_size; + atomic_bool start_flag; + atomic_bool exit_flag; + atomic_bool sync_flag; + void *move_area_ptr; +}; + +/* + * Technically, we need also for the atomic bool to be address-free, which + * is recommended, but not strictly required, by C11 for lockless + * implementations. + * However, in practice both GCC and Clang fulfill this requirement on + * all KVM-supported platforms. + */ +static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); + +static sem_t vcpu_ready; + +static bool map_unmap_verify; + +static bool verbose; +#define pr_info_v(...) \ + do { \ + if (verbose) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static void check_mmio_access(struct vm_data *data, struct kvm_run *run) +{ + TEST_ASSERT(data->mmio_ok, "Unexpected mmio exit"); + TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + TEST_ASSERT(run->mmio.phys_addr >= data->mmio_gpa_min && + run->mmio.phys_addr <= data->mmio_gpa_max, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); +} + +static void *vcpu_worker(void *__data) +{ + struct vm_data *data = __data; + struct kvm_vcpu *vcpu = data->vcpu; + struct kvm_run *run = vcpu->run; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == 0, + "Unexpected sync ucall, got %lx", + (ulong)uc.args[1]); + sem_post(&vcpu_ready); + continue; + case UCALL_NONE: + if (run->exit_reason == KVM_EXIT_MMIO) + check_mmio_access(data, run); + else + goto done; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d", errno); +} + +static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) +{ + uint64_t gpage, pgoffs; + uint32_t slot, slotoffs; + void *base; + uint32_t guest_page_size = data->vm->page_size; + + TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); + TEST_ASSERT(gpa < MEM_GPA + data->npages * guest_page_size, + "Too high gpa to translate"); + gpa -= MEM_GPA; + + gpage = gpa / guest_page_size; + pgoffs = gpa 
% guest_page_size; + slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1); + slotoffs = gpage - (slot * data->pages_per_slot); + + if (rempages) { + uint64_t slotpages; + + if (slot == data->nslots - 1) + slotpages = data->npages - slot * data->pages_per_slot; + else + slotpages = data->pages_per_slot; + + TEST_ASSERT(!pgoffs, + "Asking for remaining pages in slot but gpa not page aligned"); + *rempages = slotpages - slotoffs; + } + + base = data->hva_slots[slot]; + return (uint8_t *)base + slotoffs * guest_page_size + pgoffs; +} + +static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) +{ + uint32_t guest_page_size = data->vm->page_size; + + TEST_ASSERT(slot < data->nslots, "Too high slot number"); + + return MEM_GPA + slot * data->pages_per_slot * guest_page_size; +} + +static struct vm_data *alloc_vm(void) +{ + struct vm_data *data; + + data = malloc(sizeof(*data)); + TEST_ASSERT(data, "malloc(vmdata) failed"); + + data->vm = NULL; + data->vcpu = NULL; + data->hva_slots = NULL; + + return data; +} + +static bool check_slot_pages(uint32_t host_page_size, uint32_t guest_page_size, + uint64_t pages_per_slot, uint64_t rempages) +{ + if (!pages_per_slot) + return false; + + if ((pages_per_slot * guest_page_size) % host_page_size) + return false; + + if ((rempages * guest_page_size) % host_page_size) + return false; + + return true; +} + + +static uint64_t get_max_slots(struct vm_data *data, uint32_t host_page_size) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t mempages, pages_per_slot, rempages; + uint64_t slots; + + mempages = data->npages; + slots = data->nslots; + while (--slots > 1) { + pages_per_slot = mempages / slots; + if (!pages_per_slot) + continue; + + rempages = mempages % pages_per_slot; + if (check_slot_pages(host_page_size, guest_page_size, + pages_per_slot, rempages)) + return slots + 1; /* slot 0 is reserved */ + } + + return 0; +} + +static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, + void *guest_code, uint64_t mem_size, + struct timespec *slot_runtime) +{ + uint64_t mempages, rempages; + uint64_t guest_addr; + uint32_t slot, host_page_size, guest_page_size; + struct timespec tstart; + struct sync_area *sync; + + host_page_size = getpagesize(); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + mempages = mem_size / guest_page_size; + + data->vm = __vm_create_with_one_vcpu(&data->vcpu, mempages, guest_code); + TEST_ASSERT(data->vm->page_size == guest_page_size, "Invalid VM page size"); + + data->npages = mempages; + TEST_ASSERT(data->npages > 1, "Can't test without any memory"); + data->nslots = nslots; + data->pages_per_slot = data->npages / data->nslots; + rempages = data->npages % data->nslots; + if (!check_slot_pages(host_page_size, guest_page_size, + data->pages_per_slot, rempages)) { + *maxslots = get_max_slots(data, host_page_size); + return false; + } + + data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); + TEST_ASSERT(data->hva_slots, "malloc() fail"); + + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", + data->nslots, data->pages_per_slot, rempages); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) { + uint64_t npages; + + npages = data->pages_per_slot; + if (slot == data->nslots) + npages += rempages; + + vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, npages, + 0); + guest_addr += npages * guest_page_size; + } + 
*slot_runtime = timespec_elapsed(tstart); + + for (slot = 1, guest_addr = MEM_GPA; slot <= data->nslots; slot++) { + uint64_t npages; + uint64_t gpa; + + npages = data->pages_per_slot; + if (slot == data->nslots) + npages += rempages; + + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, slot); + TEST_ASSERT(gpa == guest_addr, + "vm_phy_pages_alloc() failed"); + + data->hva_slots[slot - 1] = addr_gpa2hva(data->vm, guest_addr); + memset(data->hva_slots[slot - 1], 0, npages * guest_page_size); + + guest_addr += npages * guest_page_size; + } + + virt_map(data->vm, MEM_GPA, MEM_GPA, data->npages); + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + sync->guest_page_size = data->vm->page_size; + atomic_init(&sync->start_flag, false); + atomic_init(&sync->exit_flag, false); + atomic_init(&sync->sync_flag, false); + + data->mmio_ok = false; + + return true; +} + +static void launch_vm(struct vm_data *data) +{ + pr_info_v("Launching the test VM\n"); + + pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); +} + +static void free_vm(struct vm_data *data) +{ + kvm_vm_free(data->vm); + free(data->hva_slots); + free(data); +} + +static void wait_guest_exit(struct vm_data *data) +{ + pthread_join(data->vcpu_thread, NULL); +} + +static void let_guest_run(struct sync_area *sync) +{ + atomic_store_explicit(&sync->start_flag, true, memory_order_release); +} + +static void guest_spin_until_start(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire)) + ; +} + +static void make_guest_exit(struct sync_area *sync) +{ + atomic_store_explicit(&sync->exit_flag, true, memory_order_release); +} + +static bool _guest_should_exit(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + return atomic_load_explicit(&sync->exit_flag, memory_order_acquire); +} + +#define guest_should_exit() unlikely(_guest_should_exit()) + +/* + * noinline so we can easily see how much time the host spends waiting + * for the guest. + * For the same reason use alarm() instead of polling clock_gettime() + * to implement a wait timeout. 
+ */ +static noinline void host_perform_sync(struct sync_area *sync) +{ + alarm(2); + + atomic_store_explicit(&sync->sync_flag, true, memory_order_release); + while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) + ; + + alarm(0); +} + +static bool guest_perform_sync(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + bool expected; + + do { + if (guest_should_exit()) + return false; + + expected = true; + } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag, + &expected, false, + memory_order_acq_rel, + memory_order_relaxed)); + + return true; +} + +static void guest_code_test_memslot_move(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); + uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (!guest_should_exit()) { + uintptr_t ptr; + + for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; + ptr += page_size) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + /* + * No host sync here since the MMIO exits are so expensive + * that the host would spend most of its time waiting for + * the guest and so instead of measuring memslot move + * performance we would measure the performance and + * likelihood of MMIO exits + */ + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_map(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr += page_size) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; + ptr += page_size) + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_unmap(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr = MEM_TEST_GPA; + + /* + * We can afford to access (map) just a small number of pages + * per host sync as otherwise the host will spend + * a significant amount of its time waiting for the guest + * (instead of doing unmap operations), so this will + * effectively turn this test into a map performance test. + * + * Just access a single page to be on the safe side. 
+ */ + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + ptr += MEM_TEST_UNMAP_SIZE / 2; + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_rw(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uint32_t page_size = (typeof(page_size))READ_ONCE(sync->guest_page_size); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + page_size / 2; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += page_size) { + uint64_t val = *(uint64_t *)ptr; + + GUEST_ASSERT_EQ(val, MEM_TEST_VAL_2); + *(uint64_t *)ptr = 0; + } + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static bool test_memslot_move_prepare(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots, bool isactive) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t movesrcgpa, movetestgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + + if (isactive) { + uint64_t lastpages; + + vm_gpa2hva(data, movesrcgpa, &lastpages); + if (lastpages * guest_page_size < MEM_TEST_MOVE_SIZE / 2) { + *maxslots = 0; + return false; + } + } + + movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1)); + sync->move_area_ptr = (void *)movetestgpa; + + if (isactive) { + data->mmio_ok = true; + data->mmio_gpa_min = movesrcgpa; + data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1; + } + + return true; +} + +static bool test_memslot_move_prepare_active(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, true); +} + +static bool test_memslot_move_prepare_inactive(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, false); +} + +static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t movesrcgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, + MEM_TEST_MOVE_GPA_DEST); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa); +} + +static void test_memslot_do_unmap(struct vm_data *data, + uint64_t offsp, uint64_t count) +{ + uint64_t gpa, ctr; + uint32_t guest_page_size = data->vm->page_size; + + for (gpa = MEM_TEST_GPA + offsp * guest_page_size, ctr = 0; ctr < count; ) { + uint64_t npages; + void *hva; + int ret; + + hva = vm_gpa2hva(data, gpa, &npages); + TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); + npages = min(npages, count - ctr); + ret = madvise(hva, npages * guest_page_size, MADV_DONTNEED); + TEST_ASSERT(!ret, + "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, + hva, gpa); + ctr += npages; + gpa += npages * guest_page_size; + } + TEST_ASSERT(ctr == count, + "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); +} + +static void test_memslot_map_unmap_check(struct vm_data *data, + uint64_t offsp, uint64_t valexp) +{ + uint64_t gpa; + uint64_t *val; + uint32_t guest_page_size = data->vm->page_size; + + if (!map_unmap_verify) + return; + + gpa = MEM_TEST_GPA + offsp * guest_page_size; + val = (typeof(val))vm_gpa2hva(data, gpa, NULL); + TEST_ASSERT(*val == valexp, + "Guest written values should read back correctly before unmap 
(%"PRIu64" vs %"PRIu64" @ %"PRIx64")", + *val, valexp, gpa); + *val = 0; +} + +static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_pages = MEM_TEST_MAP_SIZE / guest_page_size; + + /* + * Unmap the second half of the test area while guest writes to (maps) + * the first half. + */ + test_memslot_do_unmap(data, guest_pages / 2, guest_pages / 2); + + /* + * Wait for the guest to finish writing the first half of the test + * area, verify the written value on the first and the last page of + * this area and then unmap it. + * Meanwhile, the guest is writing to (mapping) the second half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + test_memslot_map_unmap_check(data, guest_pages / 2 - 1, MEM_TEST_VAL_1); + test_memslot_do_unmap(data, 0, guest_pages / 2); + + + /* + * Wait for the guest to finish writing the second half of the test + * area and verify the written value on the first and the last page + * of this area. + * The area will be unmapped at the beginning of the next loop + * iteration. + * Meanwhile, the guest is writing to (mapping) the first half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, guest_pages - 1, MEM_TEST_VAL_2); +} + +static void test_memslot_unmap_loop_common(struct vm_data *data, + struct sync_area *sync, + uint64_t chunk) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_pages = MEM_TEST_UNMAP_SIZE / guest_page_size; + uint64_t ctr; + + /* + * Wait for the guest to finish mapping page(s) in the first half + * of the test area, verify the written value and then perform unmap + * of this area. + * Meanwhile, the guest is writing to (mapping) page(s) in the second + * half of the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + for (ctr = 0; ctr < guest_pages / 2; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); + + /* Likewise, but for the opposite host / guest areas */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, guest_pages / 2, MEM_TEST_VAL_2); + for (ctr = guest_pages / 2; ctr < guest_pages; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); +} + +static void test_memslot_unmap_loop(struct vm_data *data, + struct sync_area *sync) +{ + uint32_t host_page_size = getpagesize(); + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_chunk_pages = guest_page_size >= host_page_size ? 
+ 1 : host_page_size / guest_page_size; + + test_memslot_unmap_loop_common(data, sync, guest_chunk_pages); +} + +static void test_memslot_unmap_loop_chunked(struct vm_data *data, + struct sync_area *sync) +{ + uint32_t guest_page_size = data->vm->page_size; + uint64_t guest_chunk_pages = MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size; + + test_memslot_unmap_loop_common(data, sync, guest_chunk_pages); +} + +static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t gptr; + uint32_t guest_page_size = data->vm->page_size; + + for (gptr = MEM_TEST_GPA + guest_page_size / 2; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size) + *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; + + host_perform_sync(sync); + + for (gptr = MEM_TEST_GPA; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += guest_page_size) { + uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); + uint64_t val = *vptr; + + TEST_ASSERT(val == MEM_TEST_VAL_1, + "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")", + val, gptr); + *vptr = 0; + } + + host_perform_sync(sync); +} + +struct test_data { + const char *name; + uint64_t mem_size; + void (*guest_code)(void); + bool (*prepare)(struct vm_data *data, struct sync_area *sync, + uint64_t *maxslots); + void (*loop)(struct vm_data *data, struct sync_area *sync); +}; + +static bool test_execute(int nslots, uint64_t *maxslots, + unsigned int maxtime, + const struct test_data *tdata, + uint64_t *nloops, + struct timespec *slot_runtime, + struct timespec *guest_runtime) +{ + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE; + struct vm_data *data; + struct sync_area *sync; + struct timespec tstart; + bool ret = true; + + data = alloc_vm(); + if (!prepare_vm(data, nslots, maxslots, tdata->guest_code, + mem_size, slot_runtime)) { + ret = false; + goto exit_free; + } + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + if (tdata->prepare && + !tdata->prepare(data, sync, maxslots)) { + ret = false; + goto exit_free; + } + + launch_vm(data); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + let_guest_run(sync); + + while (1) { + *guest_runtime = timespec_elapsed(tstart); + if (guest_runtime->tv_sec >= maxtime) + break; + + tdata->loop(data, sync); + + (*nloops)++; + } + + make_guest_exit(sync); + wait_guest_exit(data); + +exit_free: + free_vm(data); + + return ret; +} + +static const struct test_data tests[] = { + { + .name = "map", + .mem_size = MEM_SIZE_MAP, + .guest_code = guest_code_test_memslot_map, + .loop = test_memslot_map_loop, + }, + { + .name = "unmap", + .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop, + }, + { + .name = "unmap chunked", + .mem_size = MEM_TEST_UNMAP_SIZE + MEM_EXTRA_SIZE, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop_chunked, + }, + { + .name = "move active area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_active, + .loop = test_memslot_move_loop, + }, + { + .name = "move inactive area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_inactive, + .loop = test_memslot_move_loop, + }, + { + .name = "RW", + .guest_code = guest_code_test_memslot_rw, + .loop = test_memslot_rw_loop + }, +}; + +#define NTESTS ARRAY_SIZE(tests) + +struct test_args { + int tfirst; + int tlast; + int nslots; + int seconds; + int runs; +}; + +static void help(char *name, struct test_args *targs) +{ + int ctr; + 
+ pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", + name); + pr_info(" -h: print this help screen.\n"); + pr_info(" -v: enable verbose mode (not for benchmarking).\n"); + pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", + targs->nslots); + pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", + targs->tfirst, NTESTS - 1); + pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n", + targs->tlast, NTESTS - 1); + pr_info(" -l: specify the test length in seconds (currently: %i)\n", + targs->seconds); + pr_info(" -r: specify the number of runs per test (currently: %i)\n", + targs->runs); + + pr_info("\nAvailable tests:\n"); + for (ctr = 0; ctr < NTESTS; ctr++) + pr_info("%d: %s\n", ctr, tests[ctr].name); +} + +static bool check_memory_sizes(void) +{ + uint32_t host_page_size = getpagesize(); + uint32_t guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + + if (host_page_size > SZ_64K || guest_page_size > SZ_64K) { + pr_info("Unsupported page size on host (0x%x) or guest (0x%x)\n", + host_page_size, guest_page_size); + return false; + } + + if (MEM_SIZE % guest_page_size || + MEM_TEST_SIZE % guest_page_size) { + pr_info("invalid MEM_SIZE or MEM_TEST_SIZE\n"); + return false; + } + + if (MEM_SIZE_MAP % guest_page_size || + MEM_TEST_MAP_SIZE % guest_page_size || + (MEM_TEST_MAP_SIZE / guest_page_size) <= 2 || + (MEM_TEST_MAP_SIZE / guest_page_size) % 2) { + pr_info("invalid MEM_SIZE_MAP or MEM_TEST_MAP_SIZE\n"); + return false; + } + + if (MEM_TEST_UNMAP_SIZE > MEM_TEST_SIZE || + MEM_TEST_UNMAP_SIZE % guest_page_size || + (MEM_TEST_UNMAP_SIZE / guest_page_size) % + (2 * MEM_TEST_UNMAP_CHUNK_SIZE / guest_page_size)) { + pr_info("invalid MEM_TEST_UNMAP_SIZE or MEM_TEST_UNMAP_CHUNK_SIZE\n"); + return false; + } + + return true; +} + +static bool parse_args(int argc, char *argv[], + struct test_args *targs) +{ + uint32_t max_mem_slots; + int opt; + + while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + switch (opt) { + case 'h': + default: + help(argv[0], targs); + return false; + case 'v': + verbose = true; + break; + case 'd': + map_unmap_verify = true; + break; + case 's': + targs->nslots = atoi_paranoid(optarg); + if (targs->nslots <= 1 && targs->nslots != -1) { + pr_info("Slot count cap must be larger than 1 or -1 for no cap\n"); + return false; + } + break; + case 'f': + targs->tfirst = atoi_non_negative("First test", optarg); + break; + case 'e': + targs->tlast = atoi_non_negative("Last test", optarg); + if (targs->tlast >= NTESTS) { + pr_info("Last test to run has to be non-negative and less than %zu\n", + NTESTS); + return false; + } + break; + case 'l': + targs->seconds = atoi_non_negative("Test length", optarg); + break; + case 'r': + targs->runs = atoi_positive("Runs per test", optarg); + break; + } + } + + if (optind < argc) { + help(argv[0], targs); + return false; + } + + if (targs->tfirst > targs->tlast) { + pr_info("First test to run cannot be greater than the last test to run\n"); + return false; + } + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + if (max_mem_slots <= 1) { + pr_info("KVM_CAP_NR_MEMSLOTS should be greater than 1\n"); + return false; + } + + /* Memory slot 0 is reserved */ + if (targs->nslots == -1) + targs->nslots = max_mem_slots - 1; + else + targs->nslots = min_t(int, targs->nslots, max_mem_slots) - 1; + + pr_info_v("Allowed Number of memory slots: 
%"PRIu32"\n", + targs->nslots + 1); + + return true; +} + +struct test_result { + struct timespec slot_runtime, guest_runtime, iter_runtime; + int64_t slottimens, runtimens; + uint64_t nloops; +}; + +static bool test_loop(const struct test_data *data, + const struct test_args *targs, + struct test_result *rbestslottime, + struct test_result *rbestruntime) +{ + uint64_t maxslots; + struct test_result result = {}; + + if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, + &result.nloops, + &result.slot_runtime, &result.guest_runtime)) { + if (maxslots) + pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n", + maxslots); + else + pr_info("Memslot count may be too high for this test, try adjusting the cap\n"); + + return false; + } + + pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n", + result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec, + result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec); + if (!result.nloops) { + pr_info("No full loops done - too short test time or system too loaded?\n"); + return true; + } + + result.iter_runtime = timespec_div(result.guest_runtime, + result.nloops); + pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n", + result.nloops, + result.iter_runtime.tv_sec, + result.iter_runtime.tv_nsec); + result.slottimens = timespec_to_ns(result.slot_runtime); + result.runtimens = timespec_to_ns(result.iter_runtime); + + /* + * Only rank the slot setup time for tests using the whole test memory + * area so they are comparable + */ + if (!data->mem_size && + (!rbestslottime->slottimens || + result.slottimens < rbestslottime->slottimens)) + *rbestslottime = result; + if (!rbestruntime->runtimens || + result.runtimens < rbestruntime->runtimens) + *rbestruntime = result; + + return true; +} + +int main(int argc, char *argv[]) +{ + struct test_args targs = { + .tfirst = 0, + .tlast = NTESTS - 1, + .nslots = -1, + .seconds = 5, + .runs = 1, + }; + struct test_result rbestslottime = {}; + int tctr; + + if (!check_memory_sizes()) + return -1; + + if (!parse_args(argc, argv, &targs)) + return -1; + + for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { + const struct test_data *data = &tests[tctr]; + unsigned int runctr; + struct test_result rbestruntime = {}; + + if (tctr > targs.tfirst) + pr_info("\n"); + + pr_info("Testing %s performance with %i runs, %d seconds each\n", + data->name, targs.runs, targs.seconds); + + for (runctr = 0; runctr < targs.runs; runctr++) + if (!test_loop(data, &targs, + &rbestslottime, &rbestruntime)) + break; + + if (rbestruntime.runtimens) + pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n", + rbestruntime.iter_runtime.tv_sec, + rbestruntime.iter_runtime.tv_nsec, + rbestruntime.nloops); + } + + if (rbestslottime.slottimens) + pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n", + rbestslottime.slot_runtime.tv_sec, + rbestslottime.slot_runtime.tv_nsec); + + return 0; +} diff --git a/tools/testing/selftests/kvm/riscv/arch_timer.c b/tools/testing/selftests/kvm/riscv/arch_timer.c new file mode 100644 index 000000000000..2c792228ac0b --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/arch_timer.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * arch_timer.c - Tests the riscv64 sstc timer IRQ functionality + * + * The test validates the sstc timer IRQs using vstimecmp registers. + * It's ported from the aarch64 arch_timer test. + * + * Copyright (c) 2024, Intel Corporation. 
+ */ +#include "arch_timer.h" +#include "kvm_util.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" + +static int timer_irq = IRQ_S_TIMER; + +static void guest_irq_handler(struct ex_regs *regs) +{ + uint64_t xcnt, xcnt_diff_us, cmp; + unsigned int intid = regs->cause & ~CAUSE_IRQ_FLAG; + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + timer_irq_disable(); + + xcnt = timer_get_cycles(); + cmp = timer_get_cmp(); + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, timer_irq); + + __GUEST_ASSERT(xcnt >= cmp, + "xcnt = 0x%"PRIx64", cmp = 0x%"PRIx64", xcnt_diff_us = 0x%" PRIx64, + xcnt, cmp, xcnt_diff_us); + + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_run(struct test_vcpu_shared_data *shared_data) +{ + uint32_t irq_iter, config_iter; + + shared_data->nr_iter = 0; + shared_data->guest_stage = 0; + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms); + shared_data->xcnt = timer_get_cycles(); + timer_irq_enable(); + + /* Setup a timeout for the interrupt to arrive */ + udelay(msecs_to_usecs(test_args.timer_period_ms) + + test_args.timer_err_margin_us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + timer_irq_disable(); + local_irq_enable(); + + guest_run(shared_data); + + GUEST_DONE(); +} + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpus[0], KVM_RISCV_ISA_EXT_SSTC), + "SSTC not available, skipping test\n"); + + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + for (int i = 0; i < nr_vcpus; i++) + vcpu_init_vector_tables(vcpus[i]); + + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpus[0], RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + pr_debug("timer_freq: %lu\n", timer_freq); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/riscv/ebreak_test.c b/tools/testing/selftests/kvm/riscv/ebreak_test.c new file mode 100644 index 000000000000..823c132069b4 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/ebreak_test.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V KVM ebreak test. + * + * Copyright 2024 Beijing ESWIN Computing Technology Co., Ltd. 
+ * + */ +#include "kvm_util.h" + +#define LABEL_ADDRESS(v) ((uint64_t)&(v)) + +extern unsigned char sw_bp_1, sw_bp_2; +static uint64_t sw_bp_addr; + +static void guest_code(void) +{ + asm volatile( + ".option push\n" + ".option norvc\n" + "sw_bp_1: ebreak\n" + "sw_bp_2: ebreak\n" + ".option pop\n" + ); + GUEST_ASSERT_EQ(READ_ONCE(sw_bp_addr), LABEL_ADDRESS(sw_bp_2)); + + GUEST_DONE(); +} + +static void guest_breakpoint_handler(struct ex_regs *regs) +{ + WRITE_ONCE(sw_bp_addr, regs->epc); + regs->epc += 4; +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t pc; + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE, + }; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vm_init_vector_tables(vm); + vcpu_init_vector_tables(vcpu); + vm_install_exception_handler(vm, EXC_BREAKPOINT, + guest_breakpoint_handler); + + /* + * Enable the guest debug. + * ebreak should exit to the VMM with KVM_EXIT_DEBUG reason. + */ + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &pc); + TEST_ASSERT_EQ(pc, LABEL_ADDRESS(sw_bp_1)); + + /* skip sw_bp_1 */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), pc + 4); + + /* + * Disable all debug controls. + * Guest should handle the ebreak without exiting to the VMM. + */ + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c new file mode 100644 index 000000000000..222198dd6d04 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -0,0 +1,1038 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Check for KVM_GET_REG_LIST regressions. + * + * Copyright (c) 2023 Intel Corporation + * + */ +#include <stdio.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" + +#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK) + +enum { + VCPU_FEATURE_ISA_EXT = 0, + VCPU_FEATURE_SBI_EXT, +}; + +static bool isa_ext_cant_disable[KVM_RISCV_ISA_EXT_MAX]; + +bool filter_reg(__u64 reg) +{ + switch (reg & ~REG_MASK) { + /* + * Same set of ISA_EXT registers are not present on all host because + * ISA_EXT registers are visible to the KVM user space based on the + * ISA extensions available on the host. Also, disabling an ISA + * extension using corresponding ISA_EXT register does not affect + * the visibility of the ISA_EXT register itself. + * + * Based on above, we should filter-out all ISA_EXT registers. + * + * Note: The below list is alphabetically sorted. 
+ */ + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_A: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_C: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_D: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_F: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_H: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_I: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_M: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_V: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSCOFPMF: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSTC: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVINVAL: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVNAPOT: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SVPBMT: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZACAS: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBB: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBC: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBKB: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBKC: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBKX: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZBS: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFH: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZFHMIN: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOM: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOZ: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICNTR: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICOND: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICSR: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIFENCEI: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTNTL: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHINTPAUSE: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZIHPM: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKND: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKNE: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKNH: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKR: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSED: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKSH: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZKT: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZTSO: + case 
KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBB: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVBC: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVFH: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVFHMIN: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKB: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKG: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKNED: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKNHA: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKNHB: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKSED: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKSH: + case KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZVKT: + /* + * Like ISA_EXT registers, SBI_EXT registers are only visible when the + * host supports them and disabling them does not affect the visibility + * of the SBI_EXT register itself. + */ + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_V01: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_TIME: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_IPI: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_RFENCE: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SRST: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_HSM: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_PMU: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_DBCN: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR: + return true; + /* AIA registers are always available when Ssaia can't be disabled */ + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(siselect): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio1): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio2): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(sieh): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(siph): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio1h): + case KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio2h): + return isa_ext_cant_disable[KVM_RISCV_ISA_EXT_SSAIA]; + default: + break; + } + + return false; +} + +bool check_reject_set(int err) +{ + return err == EINVAL; +} + +void finalize_vcpu(struct kvm_vcpu *vcpu, struct vcpu_reg_list *c) +{ + unsigned long isa_ext_state[KVM_RISCV_ISA_EXT_MAX] = { 0 }; + struct vcpu_reg_sublist *s; + uint64_t feature; + int rc; + + for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) + __vcpu_get_reg(vcpu, RISCV_ISA_EXT_REG(i), &isa_ext_state[i]); + + /* + * Disable all extensions which were enabled by default + * if they were available in the risc-v host. 
+ */ + for (int i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { + rc = __vcpu_set_reg(vcpu, RISCV_ISA_EXT_REG(i), 0); + if (rc && isa_ext_state[i]) + isa_ext_cant_disable[i] = true; + } + + for (int i = 0; i < KVM_RISCV_SBI_EXT_MAX; i++) { + rc = __vcpu_set_reg(vcpu, RISCV_SBI_EXT_REG(i), 0); + TEST_ASSERT(!rc || (rc == -1 && errno == ENOENT), "Unexpected error"); + } + + for_each_sublist(c, s) { + if (!s->feature) + continue; + + switch (s->feature_type) { + case VCPU_FEATURE_ISA_EXT: + feature = RISCV_ISA_EXT_REG(s->feature); + break; + case VCPU_FEATURE_SBI_EXT: + feature = RISCV_SBI_EXT_REG(s->feature); + break; + default: + TEST_FAIL("Unknown feature type"); + } + + /* Try to enable the desired extension */ + __vcpu_set_reg(vcpu, feature, 1); + + /* Double check whether the desired extension was enabled */ + __TEST_REQUIRE(__vcpu_has_ext(vcpu, feature), + "%s not available, skipping tests", s->name); + } +} + +static const char *config_id_to_str(const char *prefix, __u64 id) +{ + /* reg_off is the offset into struct kvm_riscv_config */ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_CONFIG); + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CONFIG); + + switch (reg_off) { + case KVM_REG_RISCV_CONFIG_REG(isa): + return "KVM_REG_RISCV_CONFIG_REG(isa)"; + case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): + return "KVM_REG_RISCV_CONFIG_REG(zicbom_block_size)"; + case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): + return "KVM_REG_RISCV_CONFIG_REG(zicboz_block_size)"; + case KVM_REG_RISCV_CONFIG_REG(mvendorid): + return "KVM_REG_RISCV_CONFIG_REG(mvendorid)"; + case KVM_REG_RISCV_CONFIG_REG(marchid): + return "KVM_REG_RISCV_CONFIG_REG(marchid)"; + case KVM_REG_RISCV_CONFIG_REG(mimpid): + return "KVM_REG_RISCV_CONFIG_REG(mimpid)"; + case KVM_REG_RISCV_CONFIG_REG(satp_mode): + return "KVM_REG_RISCV_CONFIG_REG(satp_mode)"; + } + + return strdup_printf("%lld /* UNKNOWN */", reg_off); +} + +static const char *core_id_to_str(const char *prefix, __u64 id) +{ + /* reg_off is the offset into struct kvm_riscv_core */ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_CORE); + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CORE); + + switch (reg_off) { + case KVM_REG_RISCV_CORE_REG(regs.pc): + return "KVM_REG_RISCV_CORE_REG(regs.pc)"; + case KVM_REG_RISCV_CORE_REG(regs.ra): + return "KVM_REG_RISCV_CORE_REG(regs.ra)"; + case KVM_REG_RISCV_CORE_REG(regs.sp): + return "KVM_REG_RISCV_CORE_REG(regs.sp)"; + case KVM_REG_RISCV_CORE_REG(regs.gp): + return "KVM_REG_RISCV_CORE_REG(regs.gp)"; + case KVM_REG_RISCV_CORE_REG(regs.tp): + return "KVM_REG_RISCV_CORE_REG(regs.tp)"; + case KVM_REG_RISCV_CORE_REG(regs.t0) ... KVM_REG_RISCV_CORE_REG(regs.t2): + return strdup_printf("KVM_REG_RISCV_CORE_REG(regs.t%lld)", + reg_off - KVM_REG_RISCV_CORE_REG(regs.t0)); + case KVM_REG_RISCV_CORE_REG(regs.s0) ... KVM_REG_RISCV_CORE_REG(regs.s1): + return strdup_printf("KVM_REG_RISCV_CORE_REG(regs.s%lld)", + reg_off - KVM_REG_RISCV_CORE_REG(regs.s0)); + case KVM_REG_RISCV_CORE_REG(regs.a0) ... KVM_REG_RISCV_CORE_REG(regs.a7): + return strdup_printf("KVM_REG_RISCV_CORE_REG(regs.a%lld)", + reg_off - KVM_REG_RISCV_CORE_REG(regs.a0)); + case KVM_REG_RISCV_CORE_REG(regs.s2) ... KVM_REG_RISCV_CORE_REG(regs.s11): + return strdup_printf("KVM_REG_RISCV_CORE_REG(regs.s%lld)", + reg_off - KVM_REG_RISCV_CORE_REG(regs.s2) + 2); + case KVM_REG_RISCV_CORE_REG(regs.t3) ... 
KVM_REG_RISCV_CORE_REG(regs.t6): + return strdup_printf("KVM_REG_RISCV_CORE_REG(regs.t%lld)", + reg_off - KVM_REG_RISCV_CORE_REG(regs.t3) + 3); + case KVM_REG_RISCV_CORE_REG(mode): + return "KVM_REG_RISCV_CORE_REG(mode)"; + } + + return strdup_printf("%lld /* UNKNOWN */", reg_off); +} + +#define RISCV_CSR_GENERAL(csr) \ + "KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(" #csr ")" +#define RISCV_CSR_AIA(csr) \ + "KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_REG(" #csr ")" +#define RISCV_CSR_SMSTATEEN(csr) \ + "KVM_REG_RISCV_CSR_SMSTATEEN | KVM_REG_RISCV_CSR_REG(" #csr ")" + +static const char *general_csr_id_to_str(__u64 reg_off) +{ + /* reg_off is the offset into struct kvm_riscv_csr */ + switch (reg_off) { + case KVM_REG_RISCV_CSR_REG(sstatus): + return RISCV_CSR_GENERAL(sstatus); + case KVM_REG_RISCV_CSR_REG(sie): + return RISCV_CSR_GENERAL(sie); + case KVM_REG_RISCV_CSR_REG(stvec): + return RISCV_CSR_GENERAL(stvec); + case KVM_REG_RISCV_CSR_REG(sscratch): + return RISCV_CSR_GENERAL(sscratch); + case KVM_REG_RISCV_CSR_REG(sepc): + return RISCV_CSR_GENERAL(sepc); + case KVM_REG_RISCV_CSR_REG(scause): + return RISCV_CSR_GENERAL(scause); + case KVM_REG_RISCV_CSR_REG(stval): + return RISCV_CSR_GENERAL(stval); + case KVM_REG_RISCV_CSR_REG(sip): + return RISCV_CSR_GENERAL(sip); + case KVM_REG_RISCV_CSR_REG(satp): + return RISCV_CSR_GENERAL(satp); + case KVM_REG_RISCV_CSR_REG(scounteren): + return RISCV_CSR_GENERAL(scounteren); + case KVM_REG_RISCV_CSR_REG(senvcfg): + return RISCV_CSR_GENERAL(senvcfg); + } + + return strdup_printf("KVM_REG_RISCV_CSR_GENERAL | %lld /* UNKNOWN */", reg_off); +} + +static const char *aia_csr_id_to_str(__u64 reg_off) +{ + /* reg_off is the offset into struct kvm_riscv_aia_csr */ + switch (reg_off) { + case KVM_REG_RISCV_CSR_AIA_REG(siselect): + return RISCV_CSR_AIA(siselect); + case KVM_REG_RISCV_CSR_AIA_REG(iprio1): + return RISCV_CSR_AIA(iprio1); + case KVM_REG_RISCV_CSR_AIA_REG(iprio2): + return RISCV_CSR_AIA(iprio2); + case KVM_REG_RISCV_CSR_AIA_REG(sieh): + return RISCV_CSR_AIA(sieh); + case KVM_REG_RISCV_CSR_AIA_REG(siph): + return RISCV_CSR_AIA(siph); + case KVM_REG_RISCV_CSR_AIA_REG(iprio1h): + return RISCV_CSR_AIA(iprio1h); + case KVM_REG_RISCV_CSR_AIA_REG(iprio2h): + return RISCV_CSR_AIA(iprio2h); + } + + return strdup_printf("KVM_REG_RISCV_CSR_AIA | %lld /* UNKNOWN */", reg_off); +} + +static const char *smstateen_csr_id_to_str(__u64 reg_off) +{ + /* reg_off is the offset into struct kvm_riscv_smstateen_csr */ + switch (reg_off) { + case KVM_REG_RISCV_CSR_SMSTATEEN_REG(sstateen0): + return RISCV_CSR_SMSTATEEN(sstateen0); + } + + TEST_FAIL("Unknown smstateen csr reg: 0x%llx", reg_off); + return NULL; +} + +static const char *csr_id_to_str(const char *prefix, __u64 id) +{ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_CSR); + __u64 reg_subtype = reg_off & KVM_REG_RISCV_SUBTYPE_MASK; + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_CSR); + + reg_off &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + case KVM_REG_RISCV_CSR_GENERAL: + return general_csr_id_to_str(reg_off); + case KVM_REG_RISCV_CSR_AIA: + return aia_csr_id_to_str(reg_off); + case KVM_REG_RISCV_CSR_SMSTATEEN: + return smstateen_csr_id_to_str(reg_off); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +static const char *timer_id_to_str(const char *prefix, __u64 id) +{ + /* reg_off is the offset into struct kvm_riscv_timer */ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_TIMER); + + assert((id & KVM_REG_RISCV_TYPE_MASK) == 
KVM_REG_RISCV_TIMER); + + switch (reg_off) { + case KVM_REG_RISCV_TIMER_REG(frequency): + return "KVM_REG_RISCV_TIMER_REG(frequency)"; + case KVM_REG_RISCV_TIMER_REG(time): + return "KVM_REG_RISCV_TIMER_REG(time)"; + case KVM_REG_RISCV_TIMER_REG(compare): + return "KVM_REG_RISCV_TIMER_REG(compare)"; + case KVM_REG_RISCV_TIMER_REG(state): + return "KVM_REG_RISCV_TIMER_REG(state)"; + } + + return strdup_printf("%lld /* UNKNOWN */", reg_off); +} + +static const char *fp_f_id_to_str(const char *prefix, __u64 id) +{ + /* reg_off is the offset into struct __riscv_f_ext_state */ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_FP_F); + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_F); + + switch (reg_off) { + case KVM_REG_RISCV_FP_F_REG(f[0]) ... + KVM_REG_RISCV_FP_F_REG(f[31]): + return strdup_printf("KVM_REG_RISCV_FP_F_REG(f[%lld])", reg_off); + case KVM_REG_RISCV_FP_F_REG(fcsr): + return "KVM_REG_RISCV_FP_F_REG(fcsr)"; + } + + return strdup_printf("%lld /* UNKNOWN */", reg_off); +} + +static const char *fp_d_id_to_str(const char *prefix, __u64 id) +{ + /* reg_off is the offset into struct __riscv_d_ext_state */ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_FP_D); + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D); + + switch (reg_off) { + case KVM_REG_RISCV_FP_D_REG(f[0]) ... + KVM_REG_RISCV_FP_D_REG(f[31]): + return strdup_printf("KVM_REG_RISCV_FP_D_REG(f[%lld])", reg_off); + case KVM_REG_RISCV_FP_D_REG(fcsr): + return "KVM_REG_RISCV_FP_D_REG(fcsr)"; + } + + return strdup_printf("%lld /* UNKNOWN */", reg_off); +} + +#define KVM_ISA_EXT_ARR(ext) \ +[KVM_RISCV_ISA_EXT_##ext] = "KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_" #ext + +static const char *isa_ext_single_id_to_str(__u64 reg_off) +{ + static const char * const kvm_isa_ext_reg_name[] = { + KVM_ISA_EXT_ARR(A), + KVM_ISA_EXT_ARR(C), + KVM_ISA_EXT_ARR(D), + KVM_ISA_EXT_ARR(F), + KVM_ISA_EXT_ARR(H), + KVM_ISA_EXT_ARR(I), + KVM_ISA_EXT_ARR(M), + KVM_ISA_EXT_ARR(V), + KVM_ISA_EXT_ARR(SMSTATEEN), + KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), + KVM_ISA_EXT_ARR(SSTC), + KVM_ISA_EXT_ARR(SVINVAL), + KVM_ISA_EXT_ARR(SVNAPOT), + KVM_ISA_EXT_ARR(SVPBMT), + KVM_ISA_EXT_ARR(ZACAS), + KVM_ISA_EXT_ARR(ZBA), + KVM_ISA_EXT_ARR(ZBB), + KVM_ISA_EXT_ARR(ZBC), + KVM_ISA_EXT_ARR(ZBKB), + KVM_ISA_EXT_ARR(ZBKC), + KVM_ISA_EXT_ARR(ZBKX), + KVM_ISA_EXT_ARR(ZBS), + KVM_ISA_EXT_ARR(ZFA), + KVM_ISA_EXT_ARR(ZFH), + KVM_ISA_EXT_ARR(ZFHMIN), + KVM_ISA_EXT_ARR(ZICBOM), + KVM_ISA_EXT_ARR(ZICBOZ), + KVM_ISA_EXT_ARR(ZICNTR), + KVM_ISA_EXT_ARR(ZICOND), + KVM_ISA_EXT_ARR(ZICSR), + KVM_ISA_EXT_ARR(ZIFENCEI), + KVM_ISA_EXT_ARR(ZIHINTNTL), + KVM_ISA_EXT_ARR(ZIHINTPAUSE), + KVM_ISA_EXT_ARR(ZIHPM), + KVM_ISA_EXT_ARR(ZKND), + KVM_ISA_EXT_ARR(ZKNE), + KVM_ISA_EXT_ARR(ZKNH), + KVM_ISA_EXT_ARR(ZKR), + KVM_ISA_EXT_ARR(ZKSED), + KVM_ISA_EXT_ARR(ZKSH), + KVM_ISA_EXT_ARR(ZKT), + KVM_ISA_EXT_ARR(ZTSO), + KVM_ISA_EXT_ARR(ZVBB), + KVM_ISA_EXT_ARR(ZVBC), + KVM_ISA_EXT_ARR(ZVFH), + KVM_ISA_EXT_ARR(ZVFHMIN), + KVM_ISA_EXT_ARR(ZVKB), + KVM_ISA_EXT_ARR(ZVKG), + KVM_ISA_EXT_ARR(ZVKNED), + KVM_ISA_EXT_ARR(ZVKNHA), + KVM_ISA_EXT_ARR(ZVKNHB), + KVM_ISA_EXT_ARR(ZVKSED), + KVM_ISA_EXT_ARR(ZVKSH), + KVM_ISA_EXT_ARR(ZVKT), + }; + + if (reg_off >= ARRAY_SIZE(kvm_isa_ext_reg_name)) + return strdup_printf("KVM_REG_RISCV_ISA_SINGLE | %lld /* UNKNOWN */", reg_off); + + return kvm_isa_ext_reg_name[reg_off]; +} + +static const char *isa_ext_multi_id_to_str(__u64 reg_subtype, __u64 reg_off) +{ + const char *unknown = ""; + + if (reg_off > 
KVM_REG_RISCV_ISA_MULTI_REG_LAST) + unknown = " /* UNKNOWN */"; + + switch (reg_subtype) { + case KVM_REG_RISCV_ISA_MULTI_EN: + return strdup_printf("KVM_REG_RISCV_ISA_MULTI_EN | %lld%s", reg_off, unknown); + case KVM_REG_RISCV_ISA_MULTI_DIS: + return strdup_printf("KVM_REG_RISCV_ISA_MULTI_DIS | %lld%s", reg_off, unknown); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +static const char *isa_ext_id_to_str(const char *prefix, __u64 id) +{ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_ISA_EXT); + __u64 reg_subtype = reg_off & KVM_REG_RISCV_SUBTYPE_MASK; + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT); + + reg_off &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + case KVM_REG_RISCV_ISA_SINGLE: + return isa_ext_single_id_to_str(reg_off); + case KVM_REG_RISCV_ISA_MULTI_EN: + case KVM_REG_RISCV_ISA_MULTI_DIS: + return isa_ext_multi_id_to_str(reg_subtype, reg_off); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +#define KVM_SBI_EXT_ARR(ext) \ +[ext] = "KVM_REG_RISCV_SBI_SINGLE | " #ext + +static const char *sbi_ext_single_id_to_str(__u64 reg_off) +{ + /* reg_off is KVM_RISCV_SBI_EXT_ID */ + static const char * const kvm_sbi_ext_reg_name[] = { + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_V01), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_TIME), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_IPI), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_RFENCE), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SRST), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_HSM), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_PMU), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_STA), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_EXPERIMENTAL), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_VENDOR), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_DBCN), + }; + + if (reg_off >= ARRAY_SIZE(kvm_sbi_ext_reg_name)) + return strdup_printf("KVM_REG_RISCV_SBI_SINGLE | %lld /* UNKNOWN */", reg_off); + + return kvm_sbi_ext_reg_name[reg_off]; +} + +static const char *sbi_ext_multi_id_to_str(__u64 reg_subtype, __u64 reg_off) +{ + const char *unknown = ""; + + if (reg_off > KVM_REG_RISCV_SBI_MULTI_REG_LAST) + unknown = " /* UNKNOWN */"; + + switch (reg_subtype) { + case KVM_REG_RISCV_SBI_MULTI_EN: + return strdup_printf("KVM_REG_RISCV_SBI_MULTI_EN | %lld%s", reg_off, unknown); + case KVM_REG_RISCV_SBI_MULTI_DIS: + return strdup_printf("KVM_REG_RISCV_SBI_MULTI_DIS | %lld%s", reg_off, unknown); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +static const char *sbi_ext_id_to_str(const char *prefix, __u64 id) +{ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_SBI_EXT); + __u64 reg_subtype = reg_off & KVM_REG_RISCV_SUBTYPE_MASK; + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_SBI_EXT); + + reg_off &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + case KVM_REG_RISCV_SBI_SINGLE: + return sbi_ext_single_id_to_str(reg_off); + case KVM_REG_RISCV_SBI_MULTI_EN: + case KVM_REG_RISCV_SBI_MULTI_DIS: + return sbi_ext_multi_id_to_str(reg_subtype, reg_off); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +static const char *sbi_sta_id_to_str(__u64 reg_off) +{ + switch (reg_off) { + case 0: return "KVM_REG_RISCV_SBI_STA | KVM_REG_RISCV_SBI_STA_REG(shmem_lo)"; + case 1: return "KVM_REG_RISCV_SBI_STA | KVM_REG_RISCV_SBI_STA_REG(shmem_hi)"; + } + return strdup_printf("KVM_REG_RISCV_SBI_STA | %lld /* UNKNOWN */", reg_off); +} + +static const char *sbi_id_to_str(const char *prefix, __u64 id) +{ + __u64 reg_off = id & ~(REG_MASK | KVM_REG_RISCV_SBI_STATE); + 
__u64 reg_subtype = reg_off & KVM_REG_RISCV_SUBTYPE_MASK; + + assert((id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_SBI_STATE); + + reg_off &= ~KVM_REG_RISCV_SUBTYPE_MASK; + + switch (reg_subtype) { + case KVM_REG_RISCV_SBI_STA: + return sbi_sta_id_to_str(reg_off); + } + + return strdup_printf("%lld | %lld /* UNKNOWN */", reg_subtype, reg_off); +} + +void print_reg(const char *prefix, __u64 id) +{ + const char *reg_size = NULL; + + TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_RISCV, + "%s: KVM_REG_RISCV missing in reg id: 0x%llx", prefix, id); + + switch (id & KVM_REG_SIZE_MASK) { + case KVM_REG_SIZE_U32: + reg_size = "KVM_REG_SIZE_U32"; + break; + case KVM_REG_SIZE_U64: + reg_size = "KVM_REG_SIZE_U64"; + break; + case KVM_REG_SIZE_U128: + reg_size = "KVM_REG_SIZE_U128"; + break; + default: + printf("\tKVM_REG_RISCV | (%lld << KVM_REG_SIZE_SHIFT) | 0x%llx /* UNKNOWN */,\n", + (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id & ~REG_MASK); + return; + } + + switch (id & KVM_REG_RISCV_TYPE_MASK) { + case KVM_REG_RISCV_CONFIG: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_CONFIG | %s,\n", + reg_size, config_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_CORE: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_CORE | %s,\n", + reg_size, core_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_CSR: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_CSR | %s,\n", + reg_size, csr_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_TIMER: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_TIMER | %s,\n", + reg_size, timer_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_FP_F: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_FP_F | %s,\n", + reg_size, fp_f_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_FP_D: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_FP_D | %s,\n", + reg_size, fp_d_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_ISA_EXT: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_ISA_EXT | %s,\n", + reg_size, isa_ext_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_SBI_EXT: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_SBI_EXT | %s,\n", + reg_size, sbi_ext_id_to_str(prefix, id)); + break; + case KVM_REG_RISCV_SBI_STATE: + printf("\tKVM_REG_RISCV | %s | KVM_REG_RISCV_SBI_STATE | %s,\n", + reg_size, sbi_id_to_str(prefix, id)); + break; + default: + printf("\tKVM_REG_RISCV | %s | 0x%llx /* UNKNOWN */,\n", + reg_size, id & ~REG_MASK); + return; + } +} + +/* + * The current blessed list was primed with the output of kernel version + * v6.5-rc3 and then later updated with new registers. 
+ */ +static __u64 base_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(isa), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(mvendorid), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(marchid), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(mimpid), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(satp_mode), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.pc), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.ra), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.sp), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.gp), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.tp), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t0), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t1), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t2), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s0), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s1), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a0), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a1), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a2), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a3), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a4), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a5), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a6), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.a7), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s2), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s3), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s4), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s5), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s6), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s7), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s8), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s9), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s10), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.s11), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t3), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t4), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t5), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE | KVM_REG_RISCV_CORE_REG(regs.t6), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CORE 
| KVM_REG_RISCV_CORE_REG(mode), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(sstatus), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(sie), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(stvec), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(sscratch), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(sepc), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(scause), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(stval), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(sip), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(satp), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(scounteren), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_GENERAL | KVM_REG_RISCV_CSR_REG(senvcfg), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_TIMER | KVM_REG_RISCV_TIMER_REG(frequency), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_TIMER | KVM_REG_RISCV_TIMER_REG(time), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_TIMER | KVM_REG_RISCV_TIMER_REG(compare), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_TIMER | KVM_REG_RISCV_TIMER_REG(state), +}; + +/* + * The skips_set list registers that should skip set test. + * - KVM_REG_RISCV_TIMER_REG(state): set would fail if it was not initialized properly. 
+ */ +static __u64 base_skips_set[] = { + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_TIMER | KVM_REG_RISCV_TIMER_REG(state), +}; + +static __u64 sbi_base_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_V01, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_TIME, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_IPI, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_RFENCE, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SRST, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_HSM, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR, +}; + +static __u64 sbi_sta_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA, + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_STA | KVM_REG_RISCV_SBI_STA_REG(shmem_lo), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_SBI_STATE | KVM_REG_RISCV_SBI_STA | KVM_REG_RISCV_SBI_STA_REG(shmem_hi), +}; + +static __u64 zicbom_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(zicbom_block_size), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOM, +}; + +static __u64 zicboz_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CONFIG | KVM_REG_RISCV_CONFIG_REG(zicboz_block_size), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_ZICBOZ, +}; + +static __u64 aia_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(siselect), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio1), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio2), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(sieh), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(siph), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio1h), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_AIA | KVM_REG_RISCV_CSR_AIA_REG(iprio2h), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SSAIA, +}; + +static __u64 smstateen_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_CSR | KVM_REG_RISCV_CSR_SMSTATEEN | KVM_REG_RISCV_CSR_SMSTATEEN_REG(sstateen0), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_SMSTATEEN, +}; + +static __u64 fp_f_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[0]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[1]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F 
| KVM_REG_RISCV_FP_F_REG(f[2]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[3]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[4]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[5]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[6]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[7]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[8]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[9]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[10]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[11]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[12]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[13]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[14]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[15]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[16]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[17]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[18]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[19]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[20]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[21]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[22]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[23]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[24]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[25]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[26]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[27]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[28]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[29]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[30]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(f[31]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_F | KVM_REG_RISCV_FP_F_REG(fcsr), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_F, +}; + +static __u64 fp_d_regs[] = { + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[0]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[1]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[2]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[3]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[4]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[5]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[6]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[7]), + 
KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[8]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[9]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[10]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[11]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[12]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[13]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[14]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[15]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[16]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[17]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[18]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[19]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[20]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[21]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[22]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[23]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[24]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[25]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[26]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[27]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[28]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[29]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[30]), + KVM_REG_RISCV | KVM_REG_SIZE_U64 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(f[31]), + KVM_REG_RISCV | KVM_REG_SIZE_U32 | KVM_REG_RISCV_FP_D | KVM_REG_RISCV_FP_D_REG(fcsr), + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | KVM_RISCV_ISA_EXT_D, +}; + +#define SUBLIST_BASE \ + {"base", .regs = base_regs, .regs_n = ARRAY_SIZE(base_regs), \ + .skips_set = base_skips_set, .skips_set_n = ARRAY_SIZE(base_skips_set),} +#define SUBLIST_SBI_BASE \ + {"sbi-base", .feature_type = VCPU_FEATURE_SBI_EXT, .feature = KVM_RISCV_SBI_EXT_V01, \ + .regs = sbi_base_regs, .regs_n = ARRAY_SIZE(sbi_base_regs),} +#define SUBLIST_SBI_STA \ + {"sbi-sta", .feature_type = VCPU_FEATURE_SBI_EXT, .feature = KVM_RISCV_SBI_EXT_STA, \ + .regs = sbi_sta_regs, .regs_n = ARRAY_SIZE(sbi_sta_regs),} +#define SUBLIST_ZICBOM \ + {"zicbom", .feature = KVM_RISCV_ISA_EXT_ZICBOM, .regs = zicbom_regs, .regs_n = ARRAY_SIZE(zicbom_regs),} +#define SUBLIST_ZICBOZ \ + {"zicboz", .feature = KVM_RISCV_ISA_EXT_ZICBOZ, .regs = zicboz_regs, .regs_n = ARRAY_SIZE(zicboz_regs),} +#define SUBLIST_AIA \ + {"aia", .feature = KVM_RISCV_ISA_EXT_SSAIA, .regs = aia_regs, .regs_n = ARRAY_SIZE(aia_regs),} +#define SUBLIST_SMSTATEEN \ + {"smstateen", .feature = KVM_RISCV_ISA_EXT_SMSTATEEN, .regs = smstateen_regs, .regs_n = ARRAY_SIZE(smstateen_regs),} +#define SUBLIST_FP_F \ + {"fp_f", .feature = KVM_RISCV_ISA_EXT_F, .regs = fp_f_regs, \ + .regs_n = ARRAY_SIZE(fp_f_regs),} +#define SUBLIST_FP_D \ + {"fp_d", 
.feature = KVM_RISCV_ISA_EXT_D, .regs = fp_d_regs, \ + .regs_n = ARRAY_SIZE(fp_d_regs),} + +#define KVM_ISA_EXT_SIMPLE_CONFIG(ext, extu) \ +static __u64 regs_##ext[] = { \ + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | \ + KVM_REG_RISCV_ISA_EXT | KVM_REG_RISCV_ISA_SINGLE | \ + KVM_RISCV_ISA_EXT_##extu, \ +}; \ +static struct vcpu_reg_list config_##ext = { \ + .sublists = { \ + SUBLIST_BASE, \ + { \ + .name = #ext, \ + .feature = KVM_RISCV_ISA_EXT_##extu, \ + .regs = regs_##ext, \ + .regs_n = ARRAY_SIZE(regs_##ext), \ + }, \ + {0}, \ + }, \ +} \ + +#define KVM_SBI_EXT_SIMPLE_CONFIG(ext, extu) \ +static __u64 regs_sbi_##ext[] = { \ + KVM_REG_RISCV | KVM_REG_SIZE_ULONG | \ + KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | \ + KVM_RISCV_SBI_EXT_##extu, \ +}; \ +static struct vcpu_reg_list config_sbi_##ext = { \ + .sublists = { \ + SUBLIST_BASE, \ + { \ + .name = "sbi-"#ext, \ + .feature_type = VCPU_FEATURE_SBI_EXT, \ + .feature = KVM_RISCV_SBI_EXT_##extu, \ + .regs = regs_sbi_##ext, \ + .regs_n = ARRAY_SIZE(regs_sbi_##ext), \ + }, \ + {0}, \ + }, \ +} \ + +#define KVM_ISA_EXT_SUBLIST_CONFIG(ext, extu) \ +static struct vcpu_reg_list config_##ext = { \ + .sublists = { \ + SUBLIST_BASE, \ + SUBLIST_##extu, \ + {0}, \ + }, \ +} \ + +#define KVM_SBI_EXT_SUBLIST_CONFIG(ext, extu) \ +static struct vcpu_reg_list config_sbi_##ext = { \ + .sublists = { \ + SUBLIST_BASE, \ + SUBLIST_SBI_##extu, \ + {0}, \ + }, \ +} \ + +/* Note: The below list is alphabetically sorted. */ + +KVM_SBI_EXT_SUBLIST_CONFIG(base, BASE); +KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA); +KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU); +KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN); + +KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA); +KVM_ISA_EXT_SUBLIST_CONFIG(fp_f, FP_F); +KVM_ISA_EXT_SUBLIST_CONFIG(fp_d, FP_D); +KVM_ISA_EXT_SIMPLE_CONFIG(h, H); +KVM_ISA_EXT_SUBLIST_CONFIG(smstateen, SMSTATEEN); +KVM_ISA_EXT_SIMPLE_CONFIG(sscofpmf, SSCOFPMF); +KVM_ISA_EXT_SIMPLE_CONFIG(sstc, SSTC); +KVM_ISA_EXT_SIMPLE_CONFIG(svinval, SVINVAL); +KVM_ISA_EXT_SIMPLE_CONFIG(svnapot, SVNAPOT); +KVM_ISA_EXT_SIMPLE_CONFIG(svpbmt, SVPBMT); +KVM_ISA_EXT_SIMPLE_CONFIG(zacas, ZACAS); +KVM_ISA_EXT_SIMPLE_CONFIG(zba, ZBA); +KVM_ISA_EXT_SIMPLE_CONFIG(zbb, ZBB); +KVM_ISA_EXT_SIMPLE_CONFIG(zbc, ZBC); +KVM_ISA_EXT_SIMPLE_CONFIG(zbkb, ZBKB); +KVM_ISA_EXT_SIMPLE_CONFIG(zbkc, ZBKC); +KVM_ISA_EXT_SIMPLE_CONFIG(zbkx, ZBKX); +KVM_ISA_EXT_SIMPLE_CONFIG(zbs, ZBS); +KVM_ISA_EXT_SIMPLE_CONFIG(zfa, ZFA); +KVM_ISA_EXT_SIMPLE_CONFIG(zfh, ZFH); +KVM_ISA_EXT_SIMPLE_CONFIG(zfhmin, ZFHMIN); +KVM_ISA_EXT_SUBLIST_CONFIG(zicbom, ZICBOM); +KVM_ISA_EXT_SUBLIST_CONFIG(zicboz, ZICBOZ); +KVM_ISA_EXT_SIMPLE_CONFIG(zicntr, ZICNTR); +KVM_ISA_EXT_SIMPLE_CONFIG(zicond, ZICOND); +KVM_ISA_EXT_SIMPLE_CONFIG(zicsr, ZICSR); +KVM_ISA_EXT_SIMPLE_CONFIG(zifencei, ZIFENCEI); +KVM_ISA_EXT_SIMPLE_CONFIG(zihintntl, ZIHINTNTL); +KVM_ISA_EXT_SIMPLE_CONFIG(zihintpause, ZIHINTPAUSE); +KVM_ISA_EXT_SIMPLE_CONFIG(zihpm, ZIHPM); +KVM_ISA_EXT_SIMPLE_CONFIG(zknd, ZKND); +KVM_ISA_EXT_SIMPLE_CONFIG(zkne, ZKNE); +KVM_ISA_EXT_SIMPLE_CONFIG(zknh, ZKNH); +KVM_ISA_EXT_SIMPLE_CONFIG(zkr, ZKR); +KVM_ISA_EXT_SIMPLE_CONFIG(zksed, ZKSED); +KVM_ISA_EXT_SIMPLE_CONFIG(zksh, ZKSH); +KVM_ISA_EXT_SIMPLE_CONFIG(zkt, ZKT); +KVM_ISA_EXT_SIMPLE_CONFIG(ztso, ZTSO); +KVM_ISA_EXT_SIMPLE_CONFIG(zvbb, ZVBB); +KVM_ISA_EXT_SIMPLE_CONFIG(zvbc, ZVBC); +KVM_ISA_EXT_SIMPLE_CONFIG(zvfh, ZVFH); +KVM_ISA_EXT_SIMPLE_CONFIG(zvfhmin, ZVFHMIN); +KVM_ISA_EXT_SIMPLE_CONFIG(zvkb, ZVKB); +KVM_ISA_EXT_SIMPLE_CONFIG(zvkg, ZVKG); +KVM_ISA_EXT_SIMPLE_CONFIG(zvkned, ZVKNED); 
+KVM_ISA_EXT_SIMPLE_CONFIG(zvknha, ZVKNHA); +KVM_ISA_EXT_SIMPLE_CONFIG(zvknhb, ZVKNHB); +KVM_ISA_EXT_SIMPLE_CONFIG(zvksed, ZVKSED); +KVM_ISA_EXT_SIMPLE_CONFIG(zvksh, ZVKSH); +KVM_ISA_EXT_SIMPLE_CONFIG(zvkt, ZVKT); + +struct vcpu_reg_list *vcpu_configs[] = { + &config_sbi_base, + &config_sbi_sta, + &config_sbi_pmu, + &config_sbi_dbcn, + &config_aia, + &config_fp_f, + &config_fp_d, + &config_h, + &config_smstateen, + &config_sscofpmf, + &config_sstc, + &config_svinval, + &config_svnapot, + &config_svpbmt, + &config_zacas, + &config_zba, + &config_zbb, + &config_zbc, + &config_zbkb, + &config_zbkc, + &config_zbkx, + &config_zbs, + &config_zfa, + &config_zfh, + &config_zfhmin, + &config_zicbom, + &config_zicboz, + &config_zicntr, + &config_zicond, + &config_zicsr, + &config_zifencei, + &config_zihintntl, + &config_zihintpause, + &config_zihpm, + &config_zknd, + &config_zkne, + &config_zknh, + &config_zkr, + &config_zksed, + &config_zksh, + &config_zkt, + &config_ztso, + &config_zvbb, + &config_zvbc, + &config_zvfh, + &config_zvfhmin, + &config_zvkb, + &config_zvkg, + &config_zvkned, + &config_zvknha, + &config_zvknhb, + &config_zvksed, + &config_zvksh, + &config_zvkt, +}; +int vcpu_configs_n = ARRAY_SIZE(vcpu_configs); diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c new file mode 100644 index 000000000000..69bb94e6b227 --- /dev/null +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sbi_pmu_test.c - Tests the riscv64 SBI PMU functionality. + * + * Copyright (c) 2024, Rivos Inc. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include "kvm_util.h" +#include "test_util.h" +#include "processor.h" +#include "sbi.h" +#include "arch_timer.h" + +/* Maximum counters(firmware + hardware) */ +#define RISCV_MAX_PMU_COUNTERS 64 +union sbi_pmu_ctr_info ctrinfo_arr[RISCV_MAX_PMU_COUNTERS]; + +/* Snapshot shared memory data */ +#define PMU_SNAPSHOT_GPA_BASE BIT(30) +static void *snapshot_gva; +static vm_paddr_t snapshot_gpa; + +static int vcpu_shared_irq_count; +static int counter_in_use; + +/* Cache the available counters in a bitmask */ +static unsigned long counter_mask_available; + +static bool illegal_handler_invoked; + +#define SBI_PMU_TEST_BASIC BIT(0) +#define SBI_PMU_TEST_EVENTS BIT(1) +#define SBI_PMU_TEST_SNAPSHOT BIT(2) +#define SBI_PMU_TEST_OVERFLOW BIT(3) + +static int disabled_tests; + +unsigned long pmu_csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + switchcase_csr_read_32(CSR_CYCLEH, 
ret) + default : + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static inline void dummy_func_loop(uint64_t iter) +{ + int i = 0; + + while (i < iter) { + asm volatile("nop"); + i++; + } +} + +static void start_counter(unsigned long counter, unsigned long start_flags, + unsigned long ival) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, counter, 1, start_flags, + ival, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to start counter %ld\n", counter); +} + +/* This should be invoked only for reset counter use case */ +static void stop_reset_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, + stop_flags | SBI_PMU_STOP_FLAG_RESET, 0, 0, 0); + __GUEST_ASSERT(ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld\n", counter); +} + +static void stop_counter(unsigned long counter, unsigned long stop_flags) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, + 0, 0, 0); + __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", + counter, ret.error); +} + +static void guest_illegal_exception_handler(struct ex_regs *regs) +{ + __GUEST_ASSERT(regs->cause == EXC_INST_ILLEGAL, + "Unexpected exception handler %lx\n", regs->cause); + + illegal_handler_invoked = true; + /* skip the trapping instruction */ + regs->epc += 4; +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + unsigned long overflown_mask; + unsigned long counter_val = 0; + + /* Validate that we are in the correct irq handler */ + GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); + + /* Stop all counters first to avoid further interrupts */ + stop_counter(counter_in_use, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + csr_clear(CSR_SIP, BIT(IRQ_PMU_OVF)); + + overflown_mask = READ_ONCE(snapshot_data->ctr_overflow_mask); + GUEST_ASSERT(overflown_mask & 0x01); + + WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); + + counter_val = READ_ONCE(snapshot_data->ctr_values[0]); + /* Now start the counter to mimick the real driver behavior */ + start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); +} + +static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, + unsigned long cflags, + unsigned long event) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, cmask, + cflags, event, 0, 0); + __GUEST_ASSERT(ret.error == 0, "config matching failed %ld\n", ret.error); + GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS); + GUEST_ASSERT(BIT(ret.value) & counter_mask_available); + + return ret.value; +} + +static unsigned long get_num_counters(void) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_NUM_COUNTERS, 0, 0, 0, 0, 0, 0); + + __GUEST_ASSERT(ret.error == 0, "Unable to retrieve number of counters from SBI PMU"); + __GUEST_ASSERT(ret.value < RISCV_MAX_PMU_COUNTERS, + "Invalid number of counters %ld\n", ret.value); + + return ret.value; +} + +static void update_counter_info(int num_counters) +{ + int i = 0; + struct sbiret ret; + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, 0, 0, 0, 0, 
0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo_arr[i].value = ret.value; + counter_mask_available |= BIT(i); + } + + GUEST_ASSERT(counter_mask_available > 0); +} + +static unsigned long read_fw_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, idx, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == 0); + return ret.value; +} + +static unsigned long read_counter(int idx, union sbi_pmu_ctr_info ctrinfo) +{ + unsigned long counter_val = 0; + + __GUEST_ASSERT(ctrinfo.type < 2, "Invalid counter type %d", ctrinfo.type); + + if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) + counter_val = pmu_csr_read_num(ctrinfo.csr); + else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) + counter_val = read_fw_counter(idx, ctrinfo); + + return counter_val; +} + +static inline void verify_sbi_requirement_assert(void) +{ + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + if (get_host_sbi_spec_version() < sbi_mk_version(2, 0)) + __GUEST_ASSERT(0, "SBI implementation version doesn't support PMU Snapshot"); +} + +static void snapshot_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? -1 : 0; +#endif + struct sbiret ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, + lo, hi, flags, 0, 0, 0); + + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + +static void test_pmu_event(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. 
+ */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, ULONG_MAX/2); + stop_counter(counter, 0); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + stop_counter(counter, 0); + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + start_counter(counter, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_init_value); + dummy_func_loop(10000); + stop_counter(counter, 0); + + counter_value_post = read_counter(counter, ctrinfo_arr[counter]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_snapshot(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_pre, counter_value_post; + unsigned long counter_init_value = 100; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_value_pre = read_counter(counter, ctrinfo_arr[counter]); + + /* Do not set the initial value */ + start_counter(counter, 0, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + /* The counter value is updated w.r.t relative index of cbase */ + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_value_pre, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* + * We can't just update the counter without starting it. + * Do start/stop twice to simulate that by first initializing to a very + * high value and a low value after that. 
+ */ + WRITE_ONCE(snapshot_data->ctr_values[0], ULONG_MAX/2); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_pre = READ_ONCE(snapshot_data->ctr_values[0]); + + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_pre > counter_value_post, + "Counter reinitialization verification failed : post [%lx] pre [%lx]\n", + counter_value_post, counter_value_pre); + + /* Now set the initial value and compare */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + __GUEST_ASSERT(counter_value_post > counter_init_value, + "Event update verification failed: post [%lx] pre [%lx]\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_pmu_event_overflow(unsigned long event) +{ + unsigned long counter; + unsigned long counter_value_post; + unsigned long counter_init_value = ULONG_MAX - 10000; + struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; + + counter = get_counter_index(0, counter_mask_available, 0, event); + counter_in_use = counter; + + /* The counter value is updated w.r.t relative index of cbase passed to start/stop */ + WRITE_ONCE(snapshot_data->ctr_values[0], counter_init_value); + start_counter(counter, SBI_PMU_START_FLAG_INIT_SNAPSHOT, 0); + dummy_func_loop(10000); + udelay(msecs_to_usecs(2000)); + /* irq handler should have stopped the counter */ + stop_counter(counter, SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT); + + counter_value_post = READ_ONCE(snapshot_data->ctr_values[0]); + /* The counter value after stopping should be less the init value due to overflow */ + __GUEST_ASSERT(counter_value_post < counter_init_value, + "counter_value_post %lx counter_init_value %lx for counter\n", + counter_value_post, counter_init_value); + + stop_reset_counter(counter, 0); +} + +static void test_invalid_event(void) +{ + struct sbiret ret; + unsigned long event = 0x1234; /* A random event */ + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, 0, + counter_mask_available, 0, event, 0, 0); + GUEST_ASSERT_EQ(ret.error, SBI_ERR_NOT_SUPPORTED); +} + +static void test_pmu_events(void) +{ + int num_counters = 0; + + /* Get the counter details */ + num_counters = get_num_counters(); + update_counter_info(num_counters); + + /* Sanity testing for any random invalid event */ + test_invalid_event(); + + /* Only these two events are guaranteed to be present */ + test_pmu_event(SBI_PMU_HW_CPU_CYCLES); + test_pmu_event(SBI_PMU_HW_INSTRUCTIONS); + + GUEST_DONE(); +} + +static void test_pmu_basic_sanity(void) +{ + long out_val = 0; + bool probe; + struct sbiret ret; + int num_counters = 0, i; + union sbi_pmu_ctr_info ctrinfo; + + probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + num_counters = get_num_counters(); + + for (i = 0; i < num_counters; i++) { + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_GET_INFO, i, + 0, 0, 0, 0, 0); + + /* There can be gaps in logical counter indicies*/ + if (ret.error) + continue; + GUEST_ASSERT_NE(ret.value, 0); + + ctrinfo.value = ret.value; 
+
+		/**
+		 * Accessibility check of hardware and read capability of firmware counters.
+		 * The spec doesn't mandate any initial value. No need to check any value.
+		 */
+		if (ctrinfo.type == SBI_PMU_CTR_TYPE_HW) {
+			pmu_csr_read_num(ctrinfo.csr);
+			GUEST_ASSERT(illegal_handler_invoked);
+		} else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) {
+			read_fw_counter(i, ctrinfo);
+		}
+	}
+
+	GUEST_DONE();
+}
+
+static void test_pmu_events_snaphost(void)
+{
+	int num_counters = 0;
+	struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva;
+	int i;
+
+	/* Verify presence of SBI PMU and minimum required SBI version */
+	verify_sbi_requirement_assert();
+
+	snapshot_set_shmem(snapshot_gpa, 0);
+
+	/* Get the counter details */
+	num_counters = get_num_counters();
+	update_counter_info(num_counters);
+
+	/* Validate shared memory access */
+	GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_overflow_mask), 0);
+	for (i = 0; i < num_counters; i++) {
+		if (counter_mask_available & (BIT(i)))
+			GUEST_ASSERT_EQ(READ_ONCE(snapshot_data->ctr_values[i]), 0);
+	}
+	/* Only these two events are guaranteed to be present */
+	test_pmu_event_snapshot(SBI_PMU_HW_CPU_CYCLES);
+	test_pmu_event_snapshot(SBI_PMU_HW_INSTRUCTIONS);
+
+	GUEST_DONE();
+}
+
+static void test_pmu_events_overflow(void)
+{
+	int num_counters = 0;
+
+	/* Verify presence of SBI PMU and minimum required SBI version */
+	verify_sbi_requirement_assert();
+
+	snapshot_set_shmem(snapshot_gpa, 0);
+	csr_set(CSR_IE, BIT(IRQ_PMU_OVF));
+	local_irq_enable();
+
+	/* Get the counter details */
+	num_counters = get_num_counters();
+	update_counter_info(num_counters);
+
+	/*
+	 * QEMU supports overflow for cycle/instruction.
+	 * This test may fail on any platform that does not support overflow for these two events.
+	 */
+	test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES);
+	GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1);
+
+	test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS);
+	GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2);
+
+	GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+
+	vcpu_run(vcpu);
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	case UCALL_DONE:
+	case UCALL_SYNC:
+		break;
+	default:
+		TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		break;
+	}
+}
+
+void test_vm_destroy(struct kvm_vm *vm)
+{
+	memset(ctrinfo_arr, 0, sizeof(union sbi_pmu_ctr_info) * RISCV_MAX_PMU_COUNTERS);
+	counter_mask_available = 0;
+	kvm_vm_free(vm);
+}
+
+static void test_vm_basic_test(void *guest_code)
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	__TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+		       "SBI PMU not available, skipping test");
+	vm_init_vector_tables(vm);
+	/* Illegal instruction handler is required to verify read access without configuration */
+	vm_install_exception_handler(vm, EXC_INST_ILLEGAL, guest_illegal_exception_handler);
+
+	vcpu_init_vector_tables(vcpu);
+	run_vcpu(vcpu);
+
+	test_vm_destroy(vm);
+}
+
+static void test_vm_events_test(void *guest_code)
+{
+	struct kvm_vm *vm = NULL;
+	struct kvm_vcpu *vcpu = NULL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	__TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU),
+		       "SBI PMU not available, skipping test");
+	run_vcpu(vcpu);
+
+	test_vm_destroy(vm);
+}
+
+static void test_vm_setup_snapshot_mem(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	/* PMU Snapshot requires a single page only */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, PMU_SNAPSHOT_GPA_BASE,
1, 1, 0); + /* PMU_SNAPSHOT_GPA_BASE is identity mapped */ + virt_map(vm, PMU_SNAPSHOT_GPA_BASE, PMU_SNAPSHOT_GPA_BASE, 1); + + snapshot_gva = (void *)(PMU_SNAPSHOT_GPA_BASE); + snapshot_gpa = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gva); + sync_global_to_guest(vcpu->vm, snapshot_gpa); +} + +static void test_vm_events_snapshot_test(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_vm_events_overflow(void *guest_code) +{ + struct kvm_vm *vm = NULL; + struct kvm_vcpu *vcpu; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + __TEST_REQUIRE(__vcpu_has_sbi_ext(vcpu, KVM_RISCV_SBI_EXT_PMU), + "SBI PMU not available, skipping test"); + + __TEST_REQUIRE(__vcpu_has_isa_ext(vcpu, KVM_RISCV_ISA_EXT_SSCOFPMF), + "Sscofpmf is not available, skipping overflow test"); + + test_vm_setup_snapshot_mem(vm, vcpu); + vm_init_vector_tables(vm); + vm_install_interrupt_handler(vm, guest_irq_handler); + + vcpu_init_vector_tables(vcpu); + /* Initialize guest timer frequency. */ + vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency), &timer_freq); + sync_global_to_guest(vm, timer_freq); + + run_vcpu(vcpu); + + test_vm_destroy(vm); +} + +static void test_print_help(char *name) +{ + pr_info("Usage: %s [-h] [-d <test name>]\n", name); + pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-h: print this help screen\n"); +} + +static bool parse_args(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "hd:")) != -1) { + switch (opt) { + case 'd': + if (!strncmp("basic", optarg, 5)) + disabled_tests |= SBI_PMU_TEST_BASIC; + else if (!strncmp("events", optarg, 6)) + disabled_tests |= SBI_PMU_TEST_EVENTS; + else if (!strncmp("snapshot", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + else if (!strncmp("overflow", optarg, 8)) + disabled_tests |= SBI_PMU_TEST_OVERFLOW; + else + goto done; + break; + case 'h': + default: + goto done; + } + } + + return true; +done: + test_print_help(argv[0]); + return false; +} + +int main(int argc, char *argv[]) +{ + if (!parse_args(argc, argv)) + exit(KSFT_SKIP); + + if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + test_vm_basic_test(test_pmu_basic_sanity); + pr_info("SBI PMU basic test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + test_vm_events_test(test_pmu_events); + pr_info("SBI PMU event verification test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + test_vm_events_snapshot_test(test_pmu_events_snaphost); + pr_info("SBI PMU event verification with snapshot test : PASS\n"); + } + + if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + test_vm_events_overflow(test_pmu_events_overflow); + pr_info("SBI PMU event verification with overflow test : PASS\n"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c new file mode 100644 index 000000000000..e5898678bfab --- /dev/null +++ b/tools/testing/selftests/kvm/rseq_test.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Include rseq.c without _GNU_SOURCE defined, before including any headers, so + * that rseq.c is compiled with its configuration, not KVM selftests' config. 
+ */
+#undef _GNU_SOURCE
+#include "../rseq/rseq.c"
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
+#include <asm/barrier.h>
+#include <linux/atomic.h>
+#include <linux/rseq.h>
+#include <linux/unistd.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "ucall_common.h"
+
+/*
+ * Any bug related to task migration is likely to be timing-dependent; perform
+ * a large number of migrations to reduce the odds of a false negative.
+ */
+#define NR_TASK_MIGRATIONS 100000
+
+static pthread_t migration_thread;
+static cpu_set_t possible_mask;
+static int min_cpu, max_cpu;
+static bool done;
+
+static atomic_t seq_cnt;
+
+static void guest_code(void)
+{
+	for (;;)
+		GUEST_SYNC(0);
+}
+
+static int next_cpu(int cpu)
+{
+	/*
+	 * Advance to the next CPU, skipping those that weren't in the original
+	 * affinity set. Sadly, there is no CPU_SET_FOR_EACH, and cpu_set_t's
+	 * data storage is considered opaque. Note, if this task is pinned
+	 * to a small set of discontiguous CPUs, e.g. 2 and 1023, this loop will
+	 * burn a lot of cycles and the test will take longer than normal to
+	 * complete.
+	 */
+	do {
+		cpu++;
+		if (cpu > max_cpu) {
+			cpu = min_cpu;
+			TEST_ASSERT(CPU_ISSET(cpu, &possible_mask),
+				    "Min CPU = %d must always be usable", cpu);
+			break;
+		}
+	} while (!CPU_ISSET(cpu, &possible_mask));
+
+	return cpu;
+}
+
+static void *migration_worker(void *__rseq_tid)
+{
+	pid_t rseq_tid = (pid_t)(unsigned long)__rseq_tid;
+	cpu_set_t allowed_mask;
+	int r, i, cpu;
+
+	CPU_ZERO(&allowed_mask);
+
+	for (i = 0, cpu = min_cpu; i < NR_TASK_MIGRATIONS; i++, cpu = next_cpu(cpu)) {
+		CPU_SET(cpu, &allowed_mask);
+
+		/*
+		 * Bump the sequence count twice to allow the reader to detect
+		 * that a migration may have occurred in between rseq and sched
+		 * CPU ID reads. An odd sequence count indicates a migration
+		 * is in-progress, while a completely different count indicates
+		 * a migration occurred since the count was last read.
+		 */
+		atomic_inc(&seq_cnt);
+
+		/*
+		 * Ensure the odd count is visible while getcpu() isn't
+		 * stable, i.e. while changing affinity is in-progress.
+		 */
+		smp_wmb();
+		r = sched_setaffinity(rseq_tid, sizeof(allowed_mask), &allowed_mask);
+		TEST_ASSERT(!r, "sched_setaffinity failed, errno = %d (%s)",
+			    errno, strerror(errno));
+		smp_wmb();
+		atomic_inc(&seq_cnt);
+
+		CPU_CLR(cpu, &allowed_mask);
+
+		/*
+		 * Wait 1-10us before proceeding to the next iteration and more
+		 * specifically, before bumping seq_cnt again. A delay is
+		 * needed on three fronts:
+		 *
+		 *  1. To allow sched_setaffinity() to prompt migration before
+		 *     ioctl(KVM_RUN) enters the guest so that TIF_NOTIFY_RESUME
+		 *     (or TIF_NEED_RESCHED, which indirectly leads to handling
+		 *     NOTIFY_RESUME) is handled in KVM context.
+		 *
+		 *     If NOTIFY_RESUME/NEED_RESCHED is set after KVM enters
+		 *     the guest, the guest will trigger an IO/MMIO exit all the
+		 *     way to userspace and the TIF flags will be handled by
+		 *     the generic "exit to userspace" logic, not by KVM. The
+		 *     exit to userspace is necessary to give the test a chance
+		 *     to check the rseq CPU ID (see #2).
+		 *
+		 *     Alternatively, guest_code() could include an instruction
+		 *     to trigger an exit that is handled by KVM, but any such
+		 *     exit requires architecture specific code.
+		 *
+		 *  2.
To let ioctl(KVM_RUN) make its way back to the test + * before the next round of migration. The test's check on + * the rseq CPU ID must wait for migration to complete in + * order to avoid false positive, thus any kernel rseq bug + * will be missed if the next migration starts before the + * check completes. + * + * 3. To ensure the read-side makes efficient forward progress, + * e.g. if getcpu() involves a syscall. Stalling the read-side + * means the test will spend more time waiting for getcpu() + * to stabilize and less time trying to hit the timing-dependent + * bug. + * + * Because any bug in this area is likely to be timing-dependent, + * run with a range of delays at 1us intervals from 1us to 10us + * as a best effort to avoid tuning the test to the point where + * it can hit _only_ the original bug and not detect future + * regressions. + * + * The original bug can reproduce with a delay up to ~500us on + * x86-64, but starts to require more iterations to reproduce + * as the delay creeps above ~10us, and the average runtime of + * each iteration obviously increases as well. Cap the delay + * at 10us to keep test runtime reasonable while minimizing + * potential coverage loss. + * + * The lower bound for reproducing the bug is likely below 1us, + * e.g. failures occur on x86-64 with nanosleep(0), but at that + * point the overhead of the syscall likely dominates the delay. + * Use usleep() for simplicity and to avoid unnecessary kernel + * dependencies. + */ + usleep((i % 10) + 1); + } + done = true; + return NULL; +} + +static void calc_min_max_cpu(void) +{ + int i, cnt, nproc; + + TEST_REQUIRE(CPU_COUNT(&possible_mask) >= 2); + + /* + * CPU_SET doesn't provide a FOR_EACH helper, get the min/max CPU that + * this task is affined to in order to reduce the time spent querying + * unusable CPUs, e.g. if this task is pinned to a small percentage of + * total CPUs. + */ + nproc = get_nprocs_conf(); + min_cpu = -1; + max_cpu = -1; + cnt = 0; + + for (i = 0; i < nproc; i++) { + if (!CPU_ISSET(i, &possible_mask)) + continue; + if (min_cpu == -1) + min_cpu = i; + max_cpu = i; + cnt++; + } + + __TEST_REQUIRE(cnt >= 2, + "Only one usable CPU, task migration not possible"); +} + +static void help(const char *name) +{ + puts(""); + printf("usage: %s [-h] [-u]\n", name); + printf(" -u: Don't sanity check the number of successful KVM_RUNs\n"); + puts(""); + exit(0); +} + +int main(int argc, char *argv[]) +{ + bool skip_sanity_check = false; + int r, i, snapshot; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + u32 cpu, rseq_cpu; + int opt; + + while ((opt = getopt(argc, argv, "hu")) != -1) { + switch (opt) { + case 'u': + skip_sanity_check = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + r = sched_getaffinity(0, sizeof(possible_mask), &possible_mask); + TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, + strerror(errno)); + + calc_min_max_cpu(); + + r = rseq_register_current_thread(); + TEST_ASSERT(!r, "rseq_register_current_thread failed, errno = %d (%s)", + errno, strerror(errno)); + + /* + * Create and run a dummy VM that immediately exits to userspace via + * GUEST_SYNC, while concurrently migrating the process by setting its + * CPU affinity. 
+ */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + pthread_create(&migration_thread, NULL, migration_worker, + (void *)(unsigned long)syscall(SYS_gettid)); + + for (i = 0; !done; i++) { + vcpu_run(vcpu); + TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, + "Guest failed?"); + + /* + * Verify rseq's CPU matches sched's CPU. Ensure migration + * doesn't occur between getcpu() and reading the rseq cpu_id + * by rereading both if the sequence count changes, or if the + * count is odd (migration in-progress). + */ + do { + /* + * Drop bit 0 to force a mismatch if the count is odd, + * i.e. if a migration is in-progress. + */ + snapshot = atomic_read(&seq_cnt) & ~1; + + /* + * Ensure calling getcpu() and reading rseq.cpu_id complete + * in a single "no migration" window, i.e. are not reordered + * across the seq_cnt reads. + */ + smp_rmb(); + r = sys_getcpu(&cpu, NULL); + TEST_ASSERT(!r, "getcpu failed, errno = %d (%s)", + errno, strerror(errno)); + rseq_cpu = rseq_current_cpu_raw(); + smp_rmb(); + } while (snapshot != atomic_read(&seq_cnt)); + + TEST_ASSERT(rseq_cpu == cpu, + "rseq CPU = %d, sched CPU = %d", rseq_cpu, cpu); + } + + /* + * Sanity check that the test was able to enter the guest a reasonable + * number of times, e.g. didn't get stalled too often/long waiting for + * getcpu() to stabilize. A 2:1 migration:KVM_RUN ratio is a fairly + * conservative ratio on x86-64, which can do _more_ KVM_RUNs than + * migrations given the 1us+ delay in the migration task. + * + * Another reason why it may have small migration:KVM_RUN ratio is that, + * on systems with large low power mode wakeup latency, it may happen + * quite often that the scheduler is not able to wake up the target CPU + * before the vCPU thread is scheduled to another CPU. + */ + TEST_ASSERT(skip_sanity_check || i > (NR_TASK_MIGRATIONS / 2), + "Only performed %d KVM_RUNs, task stalled too much?\n\n" + " Try disabling deep sleep states to reduce CPU wakeup latency,\n" + " e.g. via cpuidle.off=1 or setting /dev/cpu_dma_latency to '0',\n" + " or run with -u to disable this sanity check.", i); + + pthread_join(migration_thread, NULL); + + kvm_vm_free(vm); + + rseq_unregister_current_thread(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c new file mode 100644 index 000000000000..b39033844756 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -0,0 +1,694 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x CMMA migration + * + * Copyright IBM Corp. 2023 + * + * Authors: + * Nico Boehr <nrb@linux.ibm.com> + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +#define MAIN_PAGE_COUNT 512 + +#define TEST_DATA_PAGE_COUNT 512 +#define TEST_DATA_MEMSLOT 1 +#define TEST_DATA_START_GFN 4096 + +#define TEST_DATA_TWO_PAGE_COUNT 256 +#define TEST_DATA_TWO_MEMSLOT 2 +#define TEST_DATA_TWO_START_GFN 8192 + +static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; + +/** + * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, + * so use_cmma goes on and the CMMA related ioctls do something. 
+ */ +static void guest_do_one_essa(void) +{ + asm volatile( + /* load TEST_DATA_START_GFN into r1 */ + " llilf 1,%[start_gfn]\n" + /* calculate the address from the gfn */ + " sllg 1,1,12(0)\n" + /* set the first page in TEST_DATA memslot to STABLE */ + " .insn rrf,0xb9ab0000,2,1,1,0\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN) + : "r1", "r2", "memory", "cc" + ); +} + +/** + * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable + * state. + */ +static void guest_dirty_test_data(void) +{ + asm volatile( + /* r1 = TEST_DATA_START_GFN */ + " xgr 1,1\n" + " llilf 1,%[start_gfn]\n" + /* r5 = TEST_DATA_PAGE_COUNT */ + " lghi 5,%[page_count]\n" + /* r5 += r1 */ + "2: agfr 5,1\n" + /* r2 = r1 << 12 */ + "1: sllg 2,1,12(0)\n" + /* essa(r4, r2, SET_STABLE) */ + " .insn rrf,0xb9ab0000,4,2,1,0\n" + /* i++ */ + " agfi 1,1\n" + /* if r1 < r5 goto 1 */ + " cgrjl 1,5,1b\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN), + [page_count] "L"(TEST_DATA_PAGE_COUNT) + : + /* the counter in our loop over the pages */ + "r1", + /* the calculated page physical address */ + "r2", + /* ESSA output register */ + "r4", + /* last page */ + "r5", + "cc", "memory" + ); +} + +static void create_main_memslot(struct kvm_vm *vm) +{ + int i; + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0); + /* set the array of memslots to zero like __vm_create does */ + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; +} + +static void create_test_memslot(struct kvm_vm *vm) +{ + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_START_GFN << vm->page_shift, + TEST_DATA_MEMSLOT, + TEST_DATA_PAGE_COUNT, + 0 + ); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void create_memslots(struct kvm_vm *vm) +{ + /* + * Our VM has the following memory layout: + * +------+---------------------------+ + * | GFN | Memslot | + * +------+---------------------------+ + * | 0 | | + * | ... | MAIN (Code, Stack, ...) | + * | 511 | | + * +------+---------------------------+ + * | 4096 | | + * | ... 
| TEST_DATA | + * | 4607 | | + * +------+---------------------------+ + */ + create_main_memslot(vm); + create_test_memslot(vm); +} + +static void finish_vm_setup(struct kvm_vm *vm) +{ + struct userspace_mem_region *slot0; + + kvm_vm_elf_load(vm, program_invocation_name); + + slot0 = memslot2region(vm, 0); + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + + kvm_arch_vm_post_create(vm); +} + +static struct kvm_vm *create_vm_two_memslots(void) +{ + struct kvm_vm *vm; + + vm = vm_create_barebones(); + + create_memslots(vm); + + finish_vm_setup(vm); + + return vm; +} + +static void enable_cmma(struct kvm_vm *vm) +{ + int r; + + r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL); + TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno); +} + +static void enable_dirty_tracking(struct kvm_vm *vm) +{ + vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES); + vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); +} + +static int __enable_migration_mode(struct kvm_vm *vm) +{ + return __kvm_device_attr_set(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_START, + NULL + ); +} + +static void enable_migration_mode(struct kvm_vm *vm) +{ + int r = __enable_migration_mode(vm); + + TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno); +} + +static bool is_migration_mode_on(struct kvm_vm *vm) +{ + u64 out; + int r; + + r = __kvm_device_attr_get(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_STATUS, + &out + ); + TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno); + return out; +} + +static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out) +{ + struct kvm_s390_cmma_log args; + int rc; + + errno = 0; + + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = flags, + .values = (__u64)&cmma_value_buf[0] + }; + rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + + *errno_out = errno; + return rc; +} + +static void test_get_cmma_basic(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + int rc, errno_out; + + /* GET_CMMA_BITS without CMMA enabled should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, ENXIO); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + vcpu_run(vcpu); + + /* GET_CMMA_BITS without migration mode and without peeking should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, EINVAL); + + /* GET_CMMA_BITS without migration mode and with peeking should work */ + rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT_EQ(errno_out, 0); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* GET_CMMA_BITS with invalid flags */ + rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno_out, EINVAL); + + kvm_vm_free(vm); +} + +static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu) +{ + TEST_ASSERT_EQ(vcpu->run->exit_reason, 13); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000); +} + +static void test_migration_mode(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + struct kvm_vcpu *vcpu; + u64 orig_psw; + int rc; + + /* enabling migration mode on a VM without memory should fail */ + rc = 
__enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + create_memslots(vm); + finish_vm_setup(vm); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + orig_psw = vcpu->run->psw_addr; + + /* + * Execute one essa instruction in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* migration mode when memslots have dirty tracking off should fail */ + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, -1); + TEST_ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + /* enable dirty tracking */ + enable_dirty_tracking(vm); + + /* enabling migration mode should work now */ + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* execute another ESSA instruction to see this goes fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * With migration mode on, create a new memslot with dirty tracking off. + * This should turn off migration mode. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_TWO_START_GFN << vm->page_shift, + TEST_DATA_TWO_MEMSLOT, + TEST_DATA_TWO_PAGE_COUNT, + 0 + ); + TEST_ASSERT(!is_migration_mode_on(vm), + "creating memslot without dirty tracking turns off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * Turn on dirty tracking on the new memslot. + * It should be possible to turn migration mode back on again. + */ + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); + rc = __enable_migration_mode(vm); + TEST_ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* + * Turn off dirty tracking again, this time with just a flag change. + * Again, migration mode should turn off. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0); + TEST_ASSERT(!is_migration_mode_on(vm), + "disabling dirty tracking should turn off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + kvm_vm_free(vm); +} + +/** + * Given a VM with the MAIN and TEST_DATA memslot, assert that both slots have + * CMMA attributes of all pages in both memslots and nothing more dirty. + * This has the useful side effect of ensuring nothing is CMMA dirty after this + * function. + */ +static void assert_all_slots_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* + * First iteration - everything should be dirty. + * Start at the main memslot... 
+ */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, MAIN_PAGE_COUNT); + TEST_ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT); + TEST_ASSERT_EQ(args.start_gfn, 0); + + /* ...and then - after a hole - the TEST_DATA memslot should follow */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = MAIN_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT); + TEST_ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN); + TEST_ASSERT_EQ(args.remaining, 0); + + /* ...and nothing else should be there */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + TEST_ASSERT_EQ(args.count, 0); + TEST_ASSERT_EQ(args.start_gfn, 0); + TEST_ASSERT_EQ(args.remaining, 0); +} + +/** + * Given a VM, assert no pages are CMMA dirty. + */ +static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* If we start from GFN 0 again, nothing should be dirty. */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + if (args.count || args.remaining || args.start_gfn) + TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu", + args.start_gfn, + args.count, + args.remaining + ); +} + +static void test_get_inital_dirty(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + /* + * Execute one essa instruction in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + assert_all_slots_cmma_dirty(vm); + + /* Start from the beginning again and make sure nothing else is dirty */ + assert_no_pages_cmma_dirty(vm); + + kvm_vm_free(vm); +} + +static void query_cmma_range(struct kvm_vm *vm, + u64 start_gfn, u64 gfn_count, + struct kvm_s390_cmma_log *res_out) +{ + *res_out = (struct kvm_s390_cmma_log){ + .start_gfn = start_gfn, + .count = gfn_count, + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out); +} + +/** + * Assert the given cmma_log struct that was executed by query_cmma_range() + * indicates the first dirty gfn is at first_dirty_gfn and contains exactly + * dirty_gfn_count CMMA values. 
+ */ +static void assert_cmma_dirty(u64 first_dirty_gfn, + u64 dirty_gfn_count, + const struct kvm_s390_cmma_log *res) +{ + TEST_ASSERT_EQ(res->start_gfn, first_dirty_gfn); + TEST_ASSERT_EQ(res->count, dirty_gfn_count); + for (size_t i = 0; i < dirty_gfn_count; i++) + TEST_ASSERT_EQ(cmma_value_buf[0], 0x0); /* stable state */ + TEST_ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */ +} + +static void test_get_skip_holes(void) +{ + size_t gfn_offset; + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_s390_cmma_log log; + struct kvm_vcpu *vcpu; + u64 orig_psw; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data); + + orig_psw = vcpu->run->psw_addr; + + /* + * Execute some essa instructions in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* un-dirty all pages */ + assert_all_slots_cmma_dirty(vm); + + /* Then, dirty just the TEST_DATA memslot */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + + gfn_offset = TEST_DATA_START_GFN; + /** + * Query CMMA attributes of one page, starting at page 0. Since the + * main memslot was not touched by the VM, this should yield the first + * page of the TEST_DATA memslot. + * The dirty bitmap should now look like this: + * 0: not dirty + * [0x1, 0x200): dirty + */ + query_cmma_range(vm, 0, 1, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + gfn_offset++; + + /** + * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA + * memslot. This should wrap back to the beginning of the TEST_DATA + * memslot, page 1. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /* Skip 32 pages */ + gfn_offset += 0x20; + + /** + * After skipping 32 pages, query the next 32 (0x20) pages. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, gfn_offset, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /** + * Query 1 page from the beginning of the TEST_DATA memslot. This should + * yield page 0x21. + * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * [0x22, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log); + gfn_offset++; + + /** + * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot. + * This should yield pages [0x23, 0x33). + * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * 0x22: dirty + * [0x23, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x23; + query_cmma_range(vm, gfn_offset, 15, &log); + assert_cmma_dirty(gfn_offset, 15, &log); + + /** + * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot. 
+ * This should yield page [0x22, 0x33) + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x22; + query_cmma_range(vm, gfn_offset, 17, &log); + assert_cmma_dirty(gfn_offset, 17, &log); + + /** + * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot. + * This should yield page 0x40 and nothing more, since there are more + * than 16 non-dirty pages after page 0x40. + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x40): dirty + * [0x40, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x40; + query_cmma_range(vm, gfn_offset, 25, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + + /** + * Query pages [0x33, 0x40). + * The dirty bitmap should now look like this: + * [0, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x33; + query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log); + assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log); + + /** + * Query the remaining pages [0x61, 0x200). + */ + gfn_offset = TEST_DATA_START_GFN; + query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log); + + assert_no_pages_cmma_dirty(vm); +} + +struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "migration mode and dirty tracking", test_migration_mode }, + { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, + { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, + { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, +}; + +/** + * The kernel may support CMMA, but the machine may not (i.e. if running as + * guest-3). + * + * In this case, the CMMA capabilities are all there, but the CMMA-related + * ioctls fail. To find out whether the machine supports CMMA, create a + * temporary VM and then query the CMMA feature of the VM. + */ +static int machine_has_cmma(void) +{ + struct kvm_vm *vm = vm_create_barebones(); + int r; + + r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA); + kvm_vm_free(vm); + + return r; +} + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION)); + TEST_REQUIRE(machine_has_cmma()); + + ksft_print_header(); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/s390x/debug_test.c b/tools/testing/selftests/kvm/s390x/debug_test.c new file mode 100644 index 000000000000..84313fb27529 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/debug_test.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test KVM debugging features. */ +#include "kvm_util.h" +#include "test_util.h" + +#include <linux/kvm.h> + +#define __LC_SVC_NEW_PSW 0x1c0 +#define __LC_PGM_NEW_PSW 0x1d0 +#define ICPT_INSTRUCTION 0x04 +#define IPA0_DIAG 0x8300 +#define PGM_SPECIFICATION 0x06 + +/* Common code for testing single-stepping interruptions. 
*/ +extern char int_handler[]; +asm("int_handler:\n" + "j .\n"); + +static struct kvm_vm *test_step_int_1(struct kvm_vcpu **vcpu, void *guest_code, + size_t new_psw_off, uint64_t *new_psw) +{ + struct kvm_guest_debug debug = {}; + struct kvm_regs regs; + struct kvm_vm *vm; + char *lowcore; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + lowcore = addr_gpa2hva(vm, 0); + new_psw[0] = (*vcpu)->run->psw_mask; + new_psw[1] = (uint64_t)int_handler; + memcpy(lowcore + new_psw_off, new_psw, 16); + vcpu_regs_get(*vcpu, ®s); + regs.gprs[2] = -1; + vcpu_regs_set(*vcpu, ®s); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + vcpu_guest_debug_set(*vcpu, &debug); + vcpu_run(*vcpu); + + return vm; +} + +static void test_step_int(void *guest_code, size_t new_psw_off) +{ + struct kvm_vcpu *vcpu; + uint64_t new_psw[2]; + struct kvm_vm *vm; + + vm = test_step_int_1(&vcpu, guest_code, new_psw_off, new_psw); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); + TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); + kvm_vm_free(vm); +} + +/* Test single-stepping "boring" program interruptions. */ +extern char test_step_pgm_guest_code[]; +asm("test_step_pgm_guest_code:\n" + ".insn rr,0x1d00,%r1,%r0 /* dr %r1,%r0 */\n" + "j .\n"); + +static void test_step_pgm(void) +{ + test_step_int(test_step_pgm_guest_code, __LC_PGM_NEW_PSW); +} + +/* + * Test single-stepping program interruptions caused by DIAG. + * Userspace emulation must not interfere with single-stepping. + */ +extern char test_step_pgm_diag_guest_code[]; +asm("test_step_pgm_diag_guest_code:\n" + "diag %r0,%r0,0\n" + "j .\n"); + +static void test_step_pgm_diag(void) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_PROGRAM_INT, + .u.pgm.code = PGM_SPECIFICATION, + }; + struct kvm_vcpu *vcpu; + uint64_t new_psw[2]; + struct kvm_vm *vm; + + vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code, + __LC_PGM_NEW_PSW, new_psw); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG); + vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_DEBUG); + TEST_ASSERT_EQ(vcpu->run->psw_mask, new_psw[0]); + TEST_ASSERT_EQ(vcpu->run->psw_addr, new_psw[1]); + kvm_vm_free(vm); +} + +/* + * Test single-stepping program interruptions caused by ISKE. + * CPUSTAT_KSS handling must not interfere with single-stepping. + */ +extern char test_step_pgm_iske_guest_code[]; +asm("test_step_pgm_iske_guest_code:\n" + "iske %r2,%r2\n" + "j .\n"); + +static void test_step_pgm_iske(void) +{ + test_step_int(test_step_pgm_iske_guest_code, __LC_PGM_NEW_PSW); +} + +/* + * Test single-stepping program interruptions caused by LCTL. + * KVM emulation must not interfere with single-stepping. + */ +extern char test_step_pgm_lctl_guest_code[]; +asm("test_step_pgm_lctl_guest_code:\n" + "lctl %c0,%c0,1\n" + "j .\n"); + +static void test_step_pgm_lctl(void) +{ + test_step_int(test_step_pgm_lctl_guest_code, __LC_PGM_NEW_PSW); +} + +/* Test single-stepping supervisor-call interruptions. */ +extern char test_step_svc_guest_code[]; +asm("test_step_svc_guest_code:\n" + "svc 0\n" + "j .\n"); + +static void test_step_svc(void) +{ + test_step_int(test_step_svc_guest_code, __LC_SVC_NEW_PSW); +} + +/* Run all tests above. 
*/ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "single-step pgm", test_step_pgm }, + { "single-step pgm caused by diag", test_step_pgm_diag }, + { "single-step pgm caused by iske", test_step_pgm_iske }, + { "single-step pgm caused by lctl", test_step_pgm_lctl }, + { "single-step svc", test_step_svc }, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 9f49ead380ab..f2df7416be84 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -4,163 +4,1186 @@ * * Copyright (C) 2019, Red Hat, Inc. */ - #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <pthread.h> + +#include <linux/bits.h> #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +enum mop_target { + LOGICAL, + SIDA, + ABSOLUTE, + INVALID, +}; + +enum mop_access_mode { + READ, + WRITE, + CMPXCHG, +}; + +struct mop_desc { + uintptr_t gaddr; + uintptr_t gaddr_v; + uint64_t set_flags; + unsigned int f_check : 1; + unsigned int f_inject : 1; + unsigned int f_key : 1; + unsigned int _gaddr_v : 1; + unsigned int _set_flags : 1; + unsigned int _sida_offset : 1; + unsigned int _ar : 1; + uint32_t size; + enum mop_target target; + enum mop_access_mode mode; + void *buf; + uint32_t sida_offset; + void *old; + uint8_t old_value[16]; + bool *cmpxchg_success; + uint8_t ar; + uint8_t key; +}; + +const uint8_t NO_KEY = 0xff; + +static struct kvm_s390_mem_op ksmo_from_desc(struct mop_desc *desc) +{ + struct kvm_s390_mem_op ksmo = { + .gaddr = (uintptr_t)desc->gaddr, + .size = desc->size, + .buf = ((uintptr_t)desc->buf), + .reserved = "ignored_ignored_ignored_ignored" + }; + + switch (desc->target) { + case LOGICAL: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; + break; + case SIDA: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_SIDA_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_SIDA_WRITE; + break; + case ABSOLUTE: + if (desc->mode == READ) + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_READ; + if (desc->mode == WRITE) + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_WRITE; + if (desc->mode == CMPXCHG) { + ksmo.op = KVM_S390_MEMOP_ABSOLUTE_CMPXCHG; + ksmo.old_addr = (uint64_t)desc->old; + memcpy(desc->old_value, desc->old, desc->size); + } + break; + case INVALID: + ksmo.op = -1; + } + if (desc->f_check) + ksmo.flags |= KVM_S390_MEMOP_F_CHECK_ONLY; + if (desc->f_inject) + ksmo.flags |= KVM_S390_MEMOP_F_INJECT_EXCEPTION; + if (desc->_set_flags) + ksmo.flags = desc->set_flags; + if (desc->f_key && desc->key != NO_KEY) { + ksmo.flags |= KVM_S390_MEMOP_F_SKEY_PROTECTION; + ksmo.key = desc->key; + } + if (desc->_ar) + ksmo.ar = desc->ar; + else + ksmo.ar = 0; + if (desc->_sida_offset) + ksmo.sida_offset = desc->sida_offset; + + return ksmo; +} + +struct test_info { + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; +}; + +#define PRINT_MEMOP false +static void print_memop(struct kvm_vcpu *vcpu, const struct kvm_s390_mem_op *ksmo) +{ + if (!PRINT_MEMOP) + return; + + if (!vcpu) + printf("vm memop("); + else + printf("vcpu memop("); + switch (ksmo->op) { + case 
KVM_S390_MEMOP_LOGICAL_READ: + printf("LOGICAL, READ, "); + break; + case KVM_S390_MEMOP_LOGICAL_WRITE: + printf("LOGICAL, WRITE, "); + break; + case KVM_S390_MEMOP_SIDA_READ: + printf("SIDA, READ, "); + break; + case KVM_S390_MEMOP_SIDA_WRITE: + printf("SIDA, WRITE, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_READ: + printf("ABSOLUTE, READ, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + printf("ABSOLUTE, WRITE, "); + break; + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + printf("ABSOLUTE, CMPXCHG, "); + break; + } + printf("gaddr=%llu, size=%u, buf=%llu, ar=%u, key=%u, old_addr=%llx", + ksmo->gaddr, ksmo->size, ksmo->buf, ksmo->ar, ksmo->key, + ksmo->old_addr); + if (ksmo->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + printf(", CHECK_ONLY"); + if (ksmo->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) + printf(", INJECT_EXCEPTION"); + if (ksmo->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) + printf(", SKEY_PROTECTION"); + puts(")"); +} + +static int err_memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, + struct mop_desc *desc) +{ + struct kvm_vcpu *vcpu = info.vcpu; + + if (!vcpu) + return __vm_ioctl(info.vm, KVM_S390_MEM_OP, ksmo); + else + return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, ksmo); +} + +static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo, + struct mop_desc *desc) +{ + int r; + + r = err_memop_ioctl(info, ksmo, desc); + if (ksmo->op == KVM_S390_MEMOP_ABSOLUTE_CMPXCHG) { + if (desc->cmpxchg_success) { + int diff = memcmp(desc->old_value, desc->old, desc->size); + *desc->cmpxchg_success = !diff; + } + } + TEST_ASSERT(!r, __KVM_IOCTL_ERROR("KVM_S390_MEM_OP", r)); +} + +#define MEMOP(err, info_p, mop_target_p, access_mode_p, buf_p, size_p, ...) \ +({ \ + struct test_info __info = (info_p); \ + struct mop_desc __desc = { \ + .target = (mop_target_p), \ + .mode = (access_mode_p), \ + .buf = (buf_p), \ + .size = (size_p), \ + __VA_ARGS__ \ + }; \ + struct kvm_s390_mem_op __ksmo; \ + \ + if (__desc._gaddr_v) { \ + if (__desc.target == ABSOLUTE) \ + __desc.gaddr = addr_gva2gpa(__info.vm, __desc.gaddr_v); \ + else \ + __desc.gaddr = __desc.gaddr_v; \ + } \ + __ksmo = ksmo_from_desc(&__desc); \ + print_memop(__info.vcpu, &__ksmo); \ + err##memop_ioctl(__info, &__ksmo, &__desc); \ +}) + +#define MOP(...) MEMOP(, __VA_ARGS__) +#define ERR_MOP(...) MEMOP(err_, __VA_ARGS__) + +#define GADDR(a) .gaddr = ((uintptr_t)a) +#define GADDR_V(v) ._gaddr_v = 1, .gaddr_v = ((uintptr_t)v) +#define CHECK_ONLY .f_check = 1 +#define SET_FLAGS(f) ._set_flags = 1, .set_flags = (f) +#define SIDA_OFFSET(o) ._sida_offset = 1, .sida_offset = (o) +#define AR(a) ._ar = 1, .ar = (a) +#define KEY(a) .f_key = 1, .key = (a) +#define INJECT .f_inject = 1 +#define CMPXCHG_OLD(o) .old = (o) +#define CMPXCHG_SUCCESS(s) .cmpxchg_success = (s) + +#define CHECK_N_DO(f, ...) 
({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); }) + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1ULL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) +#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) -#define VCPU_ID 1 +static uint8_t __aligned(PAGE_SIZE) mem1[65536]; +static uint8_t __aligned(PAGE_SIZE) mem2[65536]; -static uint8_t mem1[65536]; -static uint8_t mem2[65536]; +struct test_default { + struct kvm_vm *kvm_vm; + struct test_info vm; + struct test_info vcpu; + struct kvm_run *run; + int size; +}; -static void guest_code(void) +static struct test_default test_default_init(void *guest_code) +{ + struct kvm_vcpu *vcpu; + struct test_default t; + + t.size = min((size_t)kvm_check_cap(KVM_CAP_S390_MEM_OP), sizeof(mem1)); + t.kvm_vm = vm_create_with_one_vcpu(&vcpu, guest_code); + t.vm = (struct test_info) { t.kvm_vm, NULL }; + t.vcpu = (struct test_info) { t.kvm_vm, vcpu }; + t.run = vcpu->run; + return t; +} + +enum stage { + /* Synced state set by host, e.g. DAT */ + STAGE_INITED, + /* Guest did nothing */ + STAGE_IDLED, + /* Guest set storage keys (specifics up to test case) */ + STAGE_SKEYS_SET, + /* Guest copied memory (locations up to test case) */ + STAGE_COPIED, + /* End of guest code reached */ + STAGE_DONE, +}; + +#define HOST_SYNC(info_p, stage) \ +({ \ + struct test_info __info = (info_p); \ + struct kvm_vcpu *__vcpu = __info.vcpu; \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ + if (uc.cmd == UCALL_ABORT) { \ + REPORT_GUEST_ASSERT(uc); \ + } \ + TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + TEST_ASSERT_EQ(uc.args[1], __stage); \ +}) \ + +static void prepare_mem12(void) { int i; + for (i = 0; i < sizeof(mem1); i++) + mem1[i] = rand(); + memset(mem2, 0xaa, sizeof(mem2)); +} + +#define ASSERT_MEM_EQ(p1, p2, size) \ + TEST_ASSERT(!memcmp(p1, p2, size), "Memory contents do not match!") + +static void default_write_read(struct test_info copy_cpu, struct test_info mop_cpu, + enum mop_target mop_target, uint32_t size, uint8_t key) +{ + prepare_mem12(); + CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, + GADDR_V(mem1), KEY(key)); + HOST_SYNC(copy_cpu, STAGE_COPIED); + CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, + GADDR_V(mem2), KEY(key)); + ASSERT_MEM_EQ(mem1, mem2, size); +} + +static void default_read(struct test_info copy_cpu, struct test_info mop_cpu, + enum mop_target mop_target, uint32_t size, uint8_t key) +{ + prepare_mem12(); + CHECK_N_DO(MOP, mop_cpu, mop_target, WRITE, mem1, size, GADDR_V(mem1)); + HOST_SYNC(copy_cpu, STAGE_COPIED); + CHECK_N_DO(MOP, mop_cpu, mop_target, READ, mem2, size, + GADDR_V(mem2), KEY(key)); + ASSERT_MEM_EQ(mem1, mem2, size); +} + +static void default_cmpxchg(struct test_default *test, uint8_t key) +{ + for (int size = 1; size <= 16; size *= 2) { + for (int offset = 0; offset < 16; offset += size) { + uint8_t __aligned(16) new[16] = {}; + uint8_t __aligned(16) old[16]; + bool succ; + + prepare_mem12(); + default_write_read(test->vcpu, test->vcpu, LOGICAL, 16, NO_KEY); + + memcpy(&old, mem1, 16); + MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, + size, GADDR_V(mem1 + offset), + CMPXCHG_OLD(old + offset), + CMPXCHG_SUCCESS(&succ), KEY(key)); + HOST_SYNC(test->vcpu, STAGE_COPIED); + MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); + TEST_ASSERT(succ, "exchange of values should succeed"); + memcpy(mem1 + offset, new + offset, size); + ASSERT_MEM_EQ(mem1, mem2, 16); + + memcpy(&old, mem1, 16); + 
new[offset]++; + old[offset]++; + MOP(test->vm, ABSOLUTE, CMPXCHG, new + offset, + size, GADDR_V(mem1 + offset), + CMPXCHG_OLD(old + offset), + CMPXCHG_SUCCESS(&succ), KEY(key)); + HOST_SYNC(test->vcpu, STAGE_COPIED); + MOP(test->vm, ABSOLUTE, READ, mem2, 16, GADDR_V(mem2)); + TEST_ASSERT(!succ, "exchange of values should not succeed"); + ASSERT_MEM_EQ(mem1, mem2, 16); + ASSERT_MEM_EQ(&old, mem1, 16); + } + } +} + +static void guest_copy(void) +{ + GUEST_SYNC(STAGE_INITED); + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); +} + +static void test_copy(void) +{ + struct test_default t = test_default_init(guest_copy); + + HOST_SYNC(t.vcpu, STAGE_INITED); + + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, NO_KEY); + + kvm_vm_free(t.kvm_vm); +} + +static void test_copy_access_register(void) +{ + struct test_default t = test_default_init(guest_copy); + + HOST_SYNC(t.vcpu, STAGE_INITED); + + prepare_mem12(); + t.run->psw_mask &= ~(3UL << (63 - 17)); + t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ + + /* + * Primary address space gets used if an access register + * contains zero. The host makes use of AR[1] so is a good + * candidate to ensure the guest AR (of zero) is used. + */ + CHECK_N_DO(MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, + GADDR_V(mem1), AR(1)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, t.size, + GADDR_V(mem2), AR(1)); + ASSERT_MEM_EQ(mem1, mem2, t.size); + + kvm_vm_free(t.kvm_vm); +} + +static void set_storage_key_range(void *addr, size_t len, uint8_t key) +{ + uintptr_t _addr, abs, i; + int not_mapped = 0; + + _addr = (uintptr_t)addr; + for (i = _addr & PAGE_MASK; i < _addr + len; i += PAGE_SIZE) { + abs = i; + asm volatile ( + "lra %[abs], 0(0,%[abs])\n" + " jz 0f\n" + " llill %[not_mapped],1\n" + " j 1f\n" + "0: sske %[key], %[abs]\n" + "1:" + : [abs] "+&a" (abs), [not_mapped] "+r" (not_mapped) + : [key] "r" (key) + : "cc" + ); + GUEST_ASSERT_EQ(not_mapped, 0); + } +} + +static void guest_copy_key(void) +{ + set_storage_key_range(mem1, sizeof(mem1), 0x90); + set_storage_key_range(mem2, sizeof(mem2), 0x90); + GUEST_SYNC(STAGE_SKEYS_SET); + for (;;) { - for (i = 0; i < sizeof(mem2); i++) - mem2[i] = mem1[i]; - GUEST_SYNC(0); + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); } } -int main(int argc, char *argv[]) +static void test_copy_key(void) { - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_s390_mem_op ksmo; - int rv, i, maxsize; + struct test_default t = test_default_init(guest_copy_key); + + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm, no key */ + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, NO_KEY); + + /* vm/vcpu, machting key or key 0 */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 0); + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 0); + default_write_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); + /* + * There used to be different code paths for key handling depending on + * if the region crossed a page boundary. + * There currently are not, but the more tests the merrier. 
+ */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 0); + default_write_read(t.vcpu, t.vcpu, LOGICAL, 1, 9); + default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 0); + default_write_read(t.vcpu, t.vm, ABSOLUTE, 1, 9); + + /* vm/vcpu, mismatching keys on read, but no fetch protection */ + default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); + default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 2); + + kvm_vm_free(t.kvm_vm); +} + +static void test_cmpxchg_key(void) +{ + struct test_default t = test_default_init(guest_copy_key); - setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); - maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP); - if (!maxsize) { - print_skip("CAP_S390_MEM_OP not supported"); - exit(KSFT_SKIP); + default_cmpxchg(&t, NO_KEY); + default_cmpxchg(&t, 0); + default_cmpxchg(&t, 9); + + kvm_vm_free(t.kvm_vm); +} + +static __uint128_t cut_to_size(int size, __uint128_t val) +{ + switch (size) { + case 1: + return (uint8_t)val; + case 2: + return (uint16_t)val; + case 4: + return (uint32_t)val; + case 8: + return (uint64_t)val; + case 16: + return val; } - if (maxsize > sizeof(mem1)) - maxsize = sizeof(mem1); + GUEST_FAIL("Invalid size = %u", size); + return 0; +} - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - run = vcpu_state(vm, VCPU_ID); +static bool popcount_eq(__uint128_t a, __uint128_t b) +{ + unsigned int count_a, count_b; - for (i = 0; i < sizeof(mem1); i++) - mem1[i] = i * i + i; - - /* Set the first array */ - ksmo.gaddr = addr_gva2gpa(vm, (uintptr_t)mem1); - ksmo.flags = 0; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); - - /* Let the guest code copy the first array to the second */ - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, - "Unexpected exit reason: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + count_a = __builtin_popcountl((uint64_t)(a >> 64)) + + __builtin_popcountl((uint64_t)a); + count_b = __builtin_popcountl((uint64_t)(b >> 64)) + + __builtin_popcountl((uint64_t)b); + return count_a == count_b; +} - memset(mem2, 0xaa, sizeof(mem2)); +static __uint128_t rotate(int size, __uint128_t val, int amount) +{ + unsigned int bits = size * 8; + + amount = (amount + bits) % bits; + val = cut_to_size(size, val); + if (!amount) + return val; + return (val << (bits - amount)) | (val >> amount); +} + +const unsigned int max_block = 16; + +static void choose_block(bool guest, int i, int *size, int *offset) +{ + unsigned int rand; + + rand = i; + if (guest) { + rand = rand * 19 + 11; + *size = 1 << ((rand % 3) + 2); + rand = rand * 19 + 11; + *offset = (rand % max_block) & ~(*size - 1); + } else { + rand = rand * 17 + 5; + *size = 1 << (rand % 5); + rand = rand * 17 + 5; + *offset = (rand % max_block) & ~(*size - 1); + } +} + +static __uint128_t permutate_bits(bool guest, int i, int size, __uint128_t old) +{ + unsigned int rand; + int amount; + bool swap; + + rand = i; + rand = rand * 3 + 1; + if (guest) + rand = rand * 3 + 1; + swap = rand % 2 == 0; + if (swap) { + int i, j; + __uint128_t new; + uint8_t byte0, byte1; + + rand = rand * 3 + 1; + i = rand % size; + rand = rand * 3 + 1; + j = rand % size; + if (i == j) + return old; + new = rotate(16, old, i * 8); + byte0 = new & 0xff; + new &= ~0xff; + new = rotate(16, new, -i * 8); + new = rotate(16, new, j * 8); + byte1 = new & 0xff; + new = (new & ~0xff) | byte0; + new = rotate(16, new, -j * 8); + 
new = rotate(16, new, i * 8); + new = new | byte1; + new = rotate(16, new, -i * 8); + return new; + } + rand = rand * 3 + 1; + amount = rand % (size * 8); + return rotate(size, old, amount); +} + +static bool _cmpxchg(int size, void *target, __uint128_t *old_addr, __uint128_t new) +{ + bool ret; + + switch (size) { + case 4: { + uint32_t old = *old_addr; + + asm volatile ("cs %[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(uint32_t *)(target)) + : [new] "d" ((uint32_t)new) + : "cc" + ); + ret = old == (uint32_t)*old_addr; + *old_addr = old; + return ret; + } + case 8: { + uint64_t old = *old_addr; + + asm volatile ("csg %[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(uint64_t *)(target)) + : [new] "d" ((uint64_t)new) + : "cc" + ); + ret = old == (uint64_t)*old_addr; + *old_addr = old; + return ret; + } + case 16: { + __uint128_t old = *old_addr; + + asm volatile ("cdsg %[old],%[new],%[address]" + : [old] "+d" (old), + [address] "+Q" (*(__uint128_t *)(target)) + : [new] "d" (new) + : "cc" + ); + ret = old == *old_addr; + *old_addr = old; + return ret; + } + } + GUEST_FAIL("Invalid size = %u", size); + return 0; +} + +const unsigned int cmpxchg_iter_outer = 100, cmpxchg_iter_inner = 10000; + +static void guest_cmpxchg_key(void) +{ + int size, offset; + __uint128_t old, new; + + set_storage_key_range(mem1, max_block, 0x10); + set_storage_key_range(mem2, max_block, 0x10); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (int i = 0; i < cmpxchg_iter_outer; i++) { + do { + old = 1; + } while (!_cmpxchg(16, mem1, &old, 0)); + for (int j = 0; j < cmpxchg_iter_inner; j++) { + choose_block(true, i + j, &size, &offset); + do { + new = permutate_bits(true, i + j, size, old); + } while (!_cmpxchg(size, mem2 + offset, &old, new)); + } + } + + GUEST_SYNC(STAGE_DONE); +} - /* Get the second array */ - ksmo.gaddr = (uintptr_t)mem2; - ksmo.flags = 0; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_READ; - ksmo.buf = (uintptr_t)mem2; - ksmo.ar = 0; - vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); - - TEST_ASSERT(!memcmp(mem1, mem2, maxsize), - "Memory contents do not match!"); - - /* Check error conditions - first bad size: */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = 0; - ksmo.size = -1; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); +static void *run_guest(void *data) +{ + struct test_info *info = data; + + HOST_SYNC(*info, STAGE_DONE); + return NULL; +} + +static char *quad_to_char(__uint128_t *quad, int size) +{ + return ((char *)quad) + (sizeof(*quad) - size); +} + +static void test_cmpxchg_key_concurrent(void) +{ + struct test_default t = test_default_init(guest_cmpxchg_key); + int size, offset; + __uint128_t old, new; + bool success; + pthread_t thread; + + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, max_block, GADDR_V(mem2)); + pthread_create(&thread, NULL, run_guest, &t.vcpu); + + for (int i = 0; i < cmpxchg_iter_outer; i++) { + do { + old = 0; + new = 1; + MOP(t.vm, ABSOLUTE, CMPXCHG, &new, + sizeof(new), GADDR_V(mem1), + CMPXCHG_OLD(&old), + CMPXCHG_SUCCESS(&success), KEY(1)); + } while (!success); + for (int j = 0; j < cmpxchg_iter_inner; j++) { + choose_block(false, i + j, &size, &offset); + do { + new = permutate_bits(false, i + j, size, old); + MOP(t.vm, ABSOLUTE, CMPXCHG, quad_to_char(&new, size), + size, GADDR_V(mem2 + offset), + CMPXCHG_OLD(quad_to_char(&old, size)), + CMPXCHG_SUCCESS(&success), 
KEY(1)); + } while (!success); + } + } + + pthread_join(thread, NULL); + + MOP(t.vcpu, LOGICAL, READ, mem2, max_block, GADDR_V(mem2)); + TEST_ASSERT(popcount_eq(*(__uint128_t *)mem1, *(__uint128_t *)mem2), + "Must retain number of set bits"); + + kvm_vm_free(t.kvm_vm); +} + +static void guest_copy_key_fetch_prot(void) +{ + /* + * For some reason combining the first sync with override enablement + * results in an exception when calling HOST_SYNC. + */ + GUEST_SYNC(STAGE_INITED); + /* Storage protection override applies to both store and fetch. */ + set_storage_key_range(mem1, sizeof(mem1), 0x98); + set_storage_key_range(mem2, sizeof(mem2), 0x98); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (;;) { + memcpy(&mem2, &mem1, sizeof(mem2)); + GUEST_SYNC(STAGE_COPIED); + } +} + +static void test_copy_key_storage_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys, storage protection override in effect */ + default_write_read(t.vcpu, t.vcpu, LOGICAL, t.size, 2); + + kvm_vm_free(t.kvm_vm); +} + +static void test_copy_key_fetch_prot(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm/vcpu, matching key, fetch protection in effect */ + default_read(t.vcpu, t.vcpu, LOGICAL, t.size, 9); + default_read(t.vcpu, t.vm, ABSOLUTE, t.size, 9); + + kvm_vm_free(t.kvm_vm); +} + +#define ERR_PROT_MOP(...) \ +({ \ + int rv; \ + \ + rv = ERR_MOP(__VA_ARGS__); \ + TEST_ASSERT(rv == 4, "Should result in protection exception"); \ +}) + +static void guest_error_key(void) +{ + GUEST_SYNC(STAGE_INITED); + set_storage_key_range(mem1, PAGE_SIZE, 0x18); + set_storage_key_range(mem1 + PAGE_SIZE, sizeof(mem1) - PAGE_SIZE, 0x98); + GUEST_SYNC(STAGE_SKEYS_SET); + GUEST_SYNC(STAGE_IDLED); +} + +static void test_errors_key(void) +{ + struct test_default t = test_default_init(guest_error_key); + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm/vcpu, mismatching keys, fetch protection in effect */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem1), KEY(2)); + + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_cmpxchg_key(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + int i; + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + for (i = 1; i <= 16; i *= 2) { + __uint128_t old = 0; + + ERR_PROT_MOP(t.vm, ABSOLUTE, CMPXCHG, mem2, i, GADDR_V(mem2), + CMPXCHG_OLD(&old), KEY(2)); + } + + kvm_vm_free(t.kvm_vm); +} + +static void test_termination(void) +{ + struct test_default t = test_default_init(guest_error_key); + uint64_t prefix; + uint64_t teid; + uint64_t teid_mask = BIT(63 - 56) | BIT(63 - 60) | BIT(63 - 61); + uint64_t psw[2]; + + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys after first page */ + ERR_PROT_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), KEY(1), INJECT); + /* + * The memop injected a program exception and the test needs to 
check the + * Translation-Exception Identification (TEID). It is necessary to run + * the guest in order to be able to read the TEID from guest memory. + * Set the guest program new PSW, so the guest state is not clobbered. + */ + prefix = t.run->s.regs.prefix; + psw[0] = t.run->psw_mask; + psw[1] = t.run->psw_addr; + MOP(t.vm, ABSOLUTE, WRITE, psw, sizeof(psw), GADDR(prefix + 464)); + HOST_SYNC(t.vcpu, STAGE_IDLED); + MOP(t.vm, ABSOLUTE, READ, &teid, sizeof(teid), GADDR(prefix + 168)); + /* Bits 56, 60, 61 form a code, 0 being the only one allowing for termination */ + TEST_ASSERT_EQ(teid & teid_mask, 0); + + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_storage_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot); + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vm, mismatching keys, storage protection override not applicable to vm */ + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, WRITE, mem1, t.size, GADDR_V(mem1), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, t.size, GADDR_V(mem2), KEY(2)); + + kvm_vm_free(t.kvm_vm); +} + +const uint64_t last_page_addr = -PAGE_SIZE; + +static void guest_copy_key_fetch_prot_override(void) +{ + int i; + char *page_0 = 0; + + GUEST_SYNC(STAGE_INITED); + set_storage_key_range(0, PAGE_SIZE, 0x18); + set_storage_key_range((void *)last_page_addr, PAGE_SIZE, 0x0); + asm volatile ("sske %[key],%[addr]\n" :: [addr] "r"(0L), [key] "r"(0x18) : "cc"); + GUEST_SYNC(STAGE_SKEYS_SET); + + for (;;) { + for (i = 0; i < PAGE_SIZE; i++) + page_0[i] = mem1[i]; + GUEST_SYNC(STAGE_COPIED); + } +} + +static void test_copy_key_fetch_prot_override(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + goto out; + } + + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys on fetch, fetch protection override applies */ + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, PAGE_SIZE, GADDR_V(mem1)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); + ASSERT_MEM_EQ(mem1, mem2, 2048); + + /* + * vcpu, mismatching keys on fetch, fetch protection override applies, + * wraparound + */ + prepare_mem12(); + MOP(t.vcpu, LOGICAL, WRITE, mem1, 2 * PAGE_SIZE, GADDR_V(guest_last_page)); + HOST_SYNC(t.vcpu, STAGE_COPIED); + CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048, + GADDR_V(guest_last_page), KEY(2)); + ASSERT_MEM_EQ(mem1, mem2, 2048); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_fetch_prot_override_not_enabled(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + 
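For reference, a minimal sketch of the address rule these fetch-protection-override tests are probing (my reading of the 2048-byte reads above, not code from the patch): with CR0_FETCH_PROTECTION_OVERRIDE set, fetch protection is bypassed only for guest addresses 0..2047, which is why a 2048-byte read at address 0 succeeds and a read wrapping around from the last page may cover at most 2048 bytes past the wrap.

#include <stdbool.h>
#include <stdint.h>

/* Assumed per-byte rule, distilled from the tests above: the override
 * covers a fetched byte only if its guest address is below 2048. */
static bool fetch_prot_override_covers(uint64_t gaddr)
{
        return gaddr < 2048;
}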
goto out; + } + HOST_SYNC(t.vcpu, STAGE_INITED); + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* vcpu, mismatching keys on fetch, fetch protection override not enabled */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 2048, GADDR_V(0), KEY(2)); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_key_fetch_prot_override_enabled(void) +{ + struct test_default t = test_default_init(guest_copy_key_fetch_prot_override); + vm_vaddr_t guest_0_page, guest_last_page; + + guest_0_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, 0); + guest_last_page = vm_vaddr_alloc(t.kvm_vm, PAGE_SIZE, last_page_addr); + if (guest_0_page != 0 || guest_last_page != last_page_addr) { + print_skip("did not allocate guest pages at required positions"); + goto out; + } + HOST_SYNC(t.vcpu, STAGE_INITED); + t.run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + t.run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(t.vcpu, STAGE_SKEYS_SET); + + /* + * vcpu, mismatching keys on fetch, + * fetch protection override does not apply because memory range exceeded + */ + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, 2048 + 1, GADDR_V(0), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vcpu, LOGICAL, READ, mem2, PAGE_SIZE + 2048 + 1, + GADDR_V(guest_last_page), KEY(2)); + /* vm, fetch protected override does not apply */ + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR(0), KEY(2)); + CHECK_N_DO(ERR_PROT_MOP, t.vm, ABSOLUTE, READ, mem2, 2048, GADDR_V(guest_0_page), KEY(2)); + +out: + kvm_vm_free(t.kvm_vm); +} + +static void guest_idle(void) +{ + GUEST_SYNC(STAGE_INITED); /* for consistency's sake */ + for (;;) + GUEST_SYNC(STAGE_IDLED); +} + +static void _test_errors_common(struct test_info info, enum mop_target target, int size) +{ + int rv; + + /* Bad size: */ + rv = ERR_MOP(info, target, WRITE, mem1, -1, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes"); /* Zero size: */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = 0; - ksmo.size = 0; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + rv = ERR_MOP(info, target, WRITE, mem1, 0, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM), "ioctl allows 0 as size"); /* Bad flags: */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = -1; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), SET_FLAGS(-1)); TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags"); - /* Bad operation: */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = 0; - ksmo.size = maxsize; - ksmo.op = -1; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); - TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); - /* Bad guest address: */ - ksmo.gaddr = ~0xfffUL; - ksmo.flags = KVM_S390_MEMOP_F_CHECK_ONLY; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); - TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access"); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL), CHECK_ONLY); + TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address with CHECK_ONLY"); + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR((void *)~0xfffUL)); + 
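The MOP()/ERR_MOP() invocations used throughout these tests are convenience macros defined earlier in memop.c; the '-' lines in this hunk show the same operation open-coded against the old selftest API. A rough sketch of what one such call reduces to (illustrative only; the real macros also handle the KEY/AR/flag decorators and the vm-versus-vcpu target):

#include <stdint.h>
#include "kvm_util.h"   /* struct kvm_vcpu, __vcpu_ioctl() */

/* Illustrative reduction of a MOP(vcpu, LOGICAL, WRITE, ...) call; the
 * field names are taken from the removed open-coded sequence above. */
static int do_mem_op_sketch(struct kvm_vcpu *vcpu, void *buf, uint64_t gaddr,
                            uint32_t size, uint32_t op /* KVM_S390_MEMOP_* */)
{
        struct kvm_s390_mem_op ksmo = {
                .gaddr = gaddr,
                .flags = 0,
                .size  = size,
                .op    = op,
                .buf   = (uintptr_t)buf,
                .ar    = 0,
        };

        return __vcpu_ioctl(vcpu, KVM_S390_MEM_OP, &ksmo);
}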
TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory address on write"); /* Bad host address: */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = 0; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = 0; - ksmo.ar = 0; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + rv = ERR_MOP(info, target, WRITE, 0, size, GADDR_V(mem1)); TEST_ASSERT(rv == -1 && errno == EFAULT, "ioctl does not report bad host memory address"); + /* Bad key: */ + rv = ERR_MOP(info, target, WRITE, mem1, size, GADDR_V(mem1), KEY(17)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows invalid key"); +} + +static void test_errors(void) +{ + struct test_default t = test_default_init(guest_idle); + int rv; + + HOST_SYNC(t.vcpu, STAGE_INITED); + + _test_errors_common(t.vcpu, LOGICAL, t.size); + _test_errors_common(t.vm, ABSOLUTE, t.size); + + /* Bad operation: */ + rv = ERR_MOP(t.vcpu, INVALID, WRITE, mem1, t.size, GADDR_V(mem1)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); + /* virtual addresses are not translated when passing INVALID */ + rv = ERR_MOP(t.vm, INVALID, WRITE, mem1, PAGE_SIZE, GADDR(0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations"); + /* Bad access register: */ - run->psw_mask &= ~(3UL << (63 - 17)); - run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ - vcpu_run(vm, VCPU_ID); /* To sync new state to SIE block */ - ksmo.gaddr = (uintptr_t)mem1; - ksmo.flags = 0; - ksmo.size = maxsize; - ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE; - ksmo.buf = (uintptr_t)mem1; - ksmo.ar = 17; - rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo); + t.run->psw_mask &= ~(3UL << (63 - 17)); + t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */ + HOST_SYNC(t.vcpu, STAGE_IDLED); /* To sync new state to SIE block */ + rv = ERR_MOP(t.vcpu, LOGICAL, WRITE, mem1, t.size, GADDR_V(mem1), AR(17)); TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15"); - run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ - vcpu_run(vm, VCPU_ID); /* Run to sync new state */ + t.run->psw_mask &= ~(3UL << (63 - 17)); /* Disable AR mode */ + HOST_SYNC(t.vcpu, STAGE_IDLED); /* Run to sync new state */ - kvm_vm_free(vm); + /* Check that the SIDA calls are rejected for non-protected guests */ + rv = ERR_MOP(t.vcpu, SIDA, READ, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_READ in non-protected mode"); + rv = ERR_MOP(t.vcpu, SIDA, WRITE, mem1, 8, GADDR(0), SIDA_OFFSET(0x1c0)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl does not reject SIDA_WRITE in non-protected mode"); - return 0; + kvm_vm_free(t.kvm_vm); +} + +static void test_errors_cmpxchg(void) +{ + struct test_default t = test_default_init(guest_idle); + __uint128_t old; + int rv, i, power = 1; + + HOST_SYNC(t.vcpu, STAGE_INITED); + + for (i = 0; i < 32; i++) { + if (i == power) { + power *= 2; + continue; + } + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl allows bad size for cmpxchg"); + } + for (i = 1; i <= 16; i *= 2) { + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR((void *)~0xfffUL), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv > 0, "ioctl allows bad guest address for cmpxchg"); + } + for (i = 2; i <= 16; i *= 2) { + rv = ERR_MOP(t.vm, ABSOLUTE, CMPXCHG, mem1, i, GADDR_V(mem1 + 1), + CMPXCHG_OLD(&old)); + TEST_ASSERT(rv == -1 && errno == EINVAL, + "ioctl allows bad alignment for cmpxchg"); + } + + 
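The size and alignment loops above encode the operand rules for the cmpxchg memop. Distilled into a predicate (my summary of the expected EINVALs, not part of the patch): the size must be a power of two between 1 and 16 bytes, and the guest address must be naturally aligned to it.

#include <stdbool.h>
#include <stdint.h>

/* Summary predicate for the operand cases the loops above exercise. */
static bool cmpxchg_memop_operands_ok(uint64_t gaddr, uint32_t size)
{
        return size >= 1 && size <= 16 &&
               (size & (size - 1)) == 0 &&     /* power of two: 1,2,4,8,16 */
               (gaddr & (size - 1)) == 0;      /* naturally aligned        */
}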
kvm_vm_free(t.kvm_vm); +} + +int main(int argc, char *argv[]) +{ + int extension_cap, idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_MEM_OP)); + extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); + + struct testdef { + const char *name; + void (*test)(void); + bool requirements_met; + } testlist[] = { + { + .name = "simple copy", + .test = test_copy, + .requirements_met = true, + }, + { + .name = "generic error checks", + .test = test_errors, + .requirements_met = true, + }, + { + .name = "copy with storage keys", + .test = test_copy_key, + .requirements_met = extension_cap > 0, + }, + { + .name = "cmpxchg with storage keys", + .test = test_cmpxchg_key, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "concurrently cmpxchg with storage keys", + .test = test_cmpxchg_key_concurrent, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "copy with key storage protection override", + .test = test_copy_key_storage_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with key fetch protection", + .test = test_copy_key_fetch_prot, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with key fetch protection override", + .test = test_copy_key_fetch_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "copy with access register mode", + .test = test_copy_access_register, + .requirements_met = true, + }, + { + .name = "error checks with key", + .test = test_errors_key, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks for cmpxchg with key", + .test = test_errors_cmpxchg_key, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "error checks for cmpxchg", + .test = test_errors_cmpxchg, + .requirements_met = extension_cap & 0x2, + }, + { + .name = "termination", + .test = test_termination, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks with key storage protection override", + .test = test_errors_key_storage_prot_override, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks without key fetch prot override", + .test = test_errors_key_fetch_prot_override_not_enabled, + .requirements_met = extension_cap > 0, + }, + { + .name = "error checks with key fetch prot override", + .test = test_errors_key_fetch_prot_override_enabled, + .requirements_met = extension_cap > 0, + }, + }; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (testlist[idx].requirements_met) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - requirements not met (kernel has extension cap %#x)\n", + testlist[idx].name, extension_cap); + } + } + + ksft_finished(); /* Print results and exit() accordingly */ } diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index b143db6d8693..357943f2bea8 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -12,15 +12,14 @@ #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" -#define VCPU_ID 3 #define LOCAL_IRQS 32 -struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS]; +#define ARBITRARY_NON_ZERO_VCPU_ID 3 + +struct kvm_s390_irq buf[ARBITRARY_NON_ZERO_VCPU_ID + LOCAL_IRQS]; -struct kvm_vm *vm; -struct kvm_run *run; -struct kvm_sync_regs *sync_regs; static uint8_t regs_null[512]; static void guest_code_initial(void) @@ -58,47 +57,45 @@ static void guest_code_initial(void) ); } 
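Stepping back to the memop main() above: each testlist entry is gated on the value returned for KVM_CAP_S390_MEM_OP_EXTENSION. A small sketch of how those gates read that value; the bit meanings are inferred from the requirements_met expressions used there (non-zero meaning storage-key support, bit 0x2 meaning cmpxchg support), not quoted from KVM documentation.

#include <stdbool.h>

/* Assumed interpretation of the KVM_CAP_S390_MEM_OP_EXTENSION value,
 * matching the requirements_met expressions in the testlist above. */
static bool memop_keys_supported(int extension_cap)
{
        return extension_cap > 0;
}

static bool memop_cmpxchg_supported(int extension_cap)
{
        return (extension_cap & 0x2) != 0;
}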
-static void test_one_reg(uint64_t id, uint64_t value) +static void test_one_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t value) { - struct kvm_one_reg reg; uint64_t eval_reg; - reg.addr = (uintptr_t)&eval_reg; - reg.id = id; - vcpu_get_reg(vm, VCPU_ID, ®); + vcpu_get_reg(vcpu, id, &eval_reg); TEST_ASSERT(eval_reg == value, "value == 0x%lx", value); } -static void assert_noirq(void) +static void assert_noirq(struct kvm_vcpu *vcpu) { struct kvm_s390_irq_state irq_state; int irqs; irq_state.len = sizeof(buf); irq_state.buf = (unsigned long)buf; - irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); + irqs = __vcpu_ioctl(vcpu, KVM_S390_GET_IRQ_STATE, &irq_state); /* * irqs contains the number of retrieved interrupts. Any interrupt * (notably, the emergency call interrupt we have injected) should * be cleared by the resets, so this should be 0. */ - TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno); + TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d", errno); TEST_ASSERT(!irqs, "IRQ pending"); } -static void assert_clear(void) +static void assert_clear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; struct kvm_sregs sregs; struct kvm_regs regs; struct kvm_fpu fpu; - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vcpu, ®s); TEST_ASSERT(!memcmp(®s.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); - vcpu_fpu_get(vm, VCPU_ID, &fpu); + vcpu_fpu_get(vcpu, &fpu); TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); /* sync regs */ @@ -112,8 +109,10 @@ static void assert_clear(void) "vrs0-15 == 0 (sync_regs)"); } -static void assert_initial_noclear(void) +static void assert_initial_noclear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL, "gpr0 == 0xffff000000000000 (sync_regs)"); TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL, @@ -127,13 +126,14 @@ static void assert_initial_noclear(void) TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)"); } -static void assert_initial(void) +static void assert_initial(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; struct kvm_sregs sregs; struct kvm_fpu fpu; /* KVM_GET_SREGS */ - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)"); TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000 (KVM_GET_SREGS)"); @@ -156,36 +156,38 @@ static void assert_initial(void) TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)"); /* kvm_run */ - TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); - TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); + TEST_ASSERT(vcpu->run->psw_addr == 0, "psw_addr == 0 (kvm_run)"); + TEST_ASSERT(vcpu->run->psw_mask == 0, "psw_mask == 0 (kvm_run)"); - vcpu_fpu_get(vm, VCPU_ID, &fpu); + vcpu_fpu_get(vcpu, &fpu); TEST_ASSERT(!fpu.fpc, "fpc == 0"); - test_one_reg(KVM_REG_S390_GBEA, 1); - test_one_reg(KVM_REG_S390_PP, 0); - test_one_reg(KVM_REG_S390_TODPR, 0); - test_one_reg(KVM_REG_S390_CPU_TIMER, 0); - test_one_reg(KVM_REG_S390_CLOCK_COMP, 0); + test_one_reg(vcpu, KVM_REG_S390_GBEA, 1); + test_one_reg(vcpu, KVM_REG_S390_PP, 0); + test_one_reg(vcpu, KVM_REG_S390_TODPR, 0); + test_one_reg(vcpu, KVM_REG_S390_CPU_TIMER, 0); + test_one_reg(vcpu, 
KVM_REG_S390_CLOCK_COMP, 0); } -static void assert_normal_noclear(void) +static void assert_normal_noclear(struct kvm_vcpu *vcpu) { + struct kvm_sync_regs *sync_regs = &vcpu->run->s.regs; + TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 10 (sync_regs)"); TEST_ASSERT(sync_regs->crs[8] == 1, "cr10 == 1 (sync_regs)"); TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)"); TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)"); } -static void assert_normal(void) +static void assert_normal(struct kvm_vcpu *vcpu) { - test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); - TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID, + test_one_reg(vcpu, KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + TEST_ASSERT(vcpu->run->s.regs.pft == KVM_S390_PFAULT_TOKEN_INVALID, "pft == 0xff..... (sync_regs)"); - assert_noirq(); + assert_noirq(vcpu); } -static void inject_irq(int cpu_id) +static void inject_irq(struct kvm_vcpu *vcpu) { struct kvm_s390_irq_state irq_state; struct kvm_s390_irq *irq = &buf[0]; @@ -195,85 +197,117 @@ static void inject_irq(int cpu_id) irq_state.len = sizeof(struct kvm_s390_irq); irq_state.buf = (unsigned long)buf; irq->type = KVM_S390_INT_EMERGENCY; - irq->u.emerg.code = cpu_id; - irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); - TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); + irq->u.emerg.code = vcpu->id; + irqs = __vcpu_ioctl(vcpu, KVM_S390_SET_IRQ_STATE, &irq_state); + TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d", errno); +} + +static struct kvm_vm *create_vm(struct kvm_vcpu **vcpu) +{ + struct kvm_vm *vm; + + vm = vm_create(1); + + *vcpu = vm_vcpu_add(vm, ARBITRARY_NON_ZERO_VCPU_ID, guest_code_initial); + + return vm; } static void test_normal(void) { - pr_info("Testing normal reset\n"); - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + ksft_print_msg("Testing normal reset\n"); + vm = create_vm(&vcpu); - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu); - inject_irq(VCPU_ID); + inject_irq(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_NORMAL_RESET, NULL); /* must clears */ - assert_normal(); + assert_normal(vcpu); /* must not clears */ - assert_normal_noclear(); - assert_initial_noclear(); + assert_normal_noclear(vcpu); + assert_initial_noclear(vcpu); kvm_vm_free(vm); } static void test_initial(void) { - pr_info("Testing initial reset\n"); - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; - vcpu_run(vm, VCPU_ID); + ksft_print_msg("Testing initial reset\n"); + vm = create_vm(&vcpu); - inject_irq(VCPU_ID); + vcpu_run(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); + inject_irq(vcpu); + + vcpu_ioctl(vcpu, KVM_S390_INITIAL_RESET, NULL); /* must clears */ - assert_normal(); - assert_initial(); + assert_normal(vcpu); + assert_initial(vcpu); /* must not clears */ - assert_initial_noclear(); + assert_initial_noclear(vcpu); kvm_vm_free(vm); } static void test_clear(void) { - pr_info("Testing clear reset\n"); - vm = vm_create_default(VCPU_ID, 0, guest_code_initial); - run = vcpu_state(vm, VCPU_ID); - sync_regs = &run->s.regs; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + ksft_print_msg("Testing clear reset\n"); + vm = create_vm(&vcpu); - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu); - 
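test_normal() and test_initial() above and test_clear() just below all follow the same shape and differ only in which reset ioctl they issue and which assert helpers they run afterwards. A condensed sketch of that shared pattern, using the helpers defined in this file (illustrative only; the *_noclear checks are omitted here):

/* Condensed view of the per-reset test pattern in this file. */
static void run_reset_test_sketch(unsigned long reset_ioctl)
{
        struct kvm_vcpu *vcpu;
        struct kvm_vm *vm = create_vm(&vcpu);

        vcpu_run(vcpu);                         /* guest dirties registers      */
        inject_irq(vcpu);                       /* queue an emergency-call IRQ  */
        vcpu_ioctl(vcpu, reset_ioctl, NULL);

        assert_normal(vcpu);                    /* every reset clears this much */
        if (reset_ioctl != KVM_S390_NORMAL_RESET)
                assert_initial(vcpu);           /* initial and clear resets     */
        if (reset_ioctl == KVM_S390_CLEAR_RESET)
                assert_clear(vcpu);             /* clear reset only             */

        kvm_vm_free(vm);
}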
inject_irq(VCPU_ID); + inject_irq(vcpu); - vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); + vcpu_ioctl(vcpu, KVM_S390_CLEAR_RESET, NULL); /* must clears */ - assert_normal(); - assert_initial(); - assert_clear(); + assert_normal(vcpu); + assert_initial(vcpu); + assert_clear(vcpu); kvm_vm_free(vm); } +struct testdef { + const char *name; + void (*test)(void); + bool needs_cap; +} testlist[] = { + { "initial", test_initial, false }, + { "normal", test_normal, true }, + { "clear", test_clear, true }, +}; + int main(int argc, char *argv[]) { - setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ - - test_initial(); - if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) { - test_normal(); - test_clear(); + bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); + int idx; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (!testlist[idx].needs_cap || has_s390_vcpu_resets) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - no VCPU_RESETS capability\n", + testlist[idx].name); + } } - return 0; + + ksft_finished(); /* Print results and exit() accordingly */ } diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index 5731ccf34917..53def355ccba 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -10,8 +10,6 @@ * * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -20,8 +18,8 @@ #include "test_util.h" #include "kvm_util.h" - -#define VCPU_ID 5 +#include "diag318_test_handler.h" +#include "kselftest.h" static void guest_code(void) { @@ -39,13 +37,13 @@ static void guest_code(void) #define REG_COMPARE(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%llx, 0x%llx\n", \ + " values did not match: 0x%llx, 0x%llx", \ left->reg, right->reg) #define REG_COMPARE32(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%x, 0x%x\n", \ + " values did not match: 0x%x, 0x%x", \ left->reg, right->reg) @@ -70,81 +68,83 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) #undef REG_COMPARE -#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS) +#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) #define INVALID_SYNC_FIELD 0x80000000 -int main(int argc, char *argv[]) +void test_read_invalid(struct kvm_vcpu *vcpu) { - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_regs regs; - struct kvm_sregs sregs; - int rv, cap; - - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - if (!cap) { - print_skip("CAP_SYNC_REGS not supported"); - exit(KSFT_SKIP); - } - - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - run = vcpu_state(vm, VCPU_ID); + struct kvm_run *run = vcpu->run; + int rv; /* Request reading invalid register set from VCPU. 
*/ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; +} + +void test_set_invalid(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int rv; /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; +} + +void test_req_and_verify_all_valid_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Request and verify all valid register sets. 
*/ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); - TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, - "Unexpected exit reason: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s390_sieic.icptcode == 4 && (run->s390_sieic.ipa >> 8) == 0x83 && (run->s390_sieic.ipb >> 16) == 0x501, - "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n", + "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x", run->s390_sieic.icptcode, run->s390_sieic.ipa, run->s390_sieic.ipb); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vcpu, ®s); compare_regs(®s, &run->s.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_set_and_verify_various_reg_values(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Set and verify various register values */ run->s.regs.gprs[11] = 0xBAD1DEA; @@ -152,24 +152,36 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); - TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, - "Unexpected exit reason: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + + if (get_diag318_info() > 0) { + run->s.regs.diag318 = get_diag318_info(); + run->kvm_dirty_regs |= KVM_SYNC_DIAG318; + } + + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, "r11 sync regs value incorrect 0x%llx.", run->s.regs.gprs[11]); TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, "acr0 sync regs value incorrect 0x%x.", run->s.regs.acrs[0]); + TEST_ASSERT(run->s.regs.diag318 == get_diag318_info(), + "diag318 sync regs value incorrect 0x%llx.", + run->s.regs.diag318); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vcpu, ®s); compare_regs(®s, &run->s.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_clear_kvm_dirty_regs_bits(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int rv; /* Clear kvm_dirty_regs bits, verify new s.regs values are * overwritten with existing guest values. 
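For orientation, the whole KVM_CAP_SYNC_REGS round trip these tests exercise, condensed into one helper (field and macro names as used in this file; a sketch, not test code from the patch): kvm_dirty_regs tells KVM which run->s.regs sets to load into the vCPU before entry, kvm_valid_regs which sets to write back on exit.

#include "kvm_util.h"   /* struct kvm_vcpu, vcpu_run() */

/* One sync-regs round trip, condensed from the tests above. */
static void sync_regs_round_trip_sketch(struct kvm_vcpu *vcpu)
{
        struct kvm_run *run = vcpu->run;

        run->s.regs.gprs[11] = 0xBAD1DEA;       /* value pushed into the vCPU  */
        run->kvm_dirty_regs  = KVM_SYNC_GPRS;   /* load GPRs from run->s.regs  */
        run->kvm_valid_regs  = KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS;
        vcpu_run(vcpu);                         /* on return, the requested    */
                                                /* sets in run->s.regs are     */
                                                /* refreshed from the vCPU     */
}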
@@ -177,17 +189,50 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = 0; run->s.regs.gprs[11] = 0xDEADBEEF; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); - TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, - "Unexpected exit reason: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + run->s.regs.diag318 = 0x4B1D; + rv = _vcpu_run(vcpu); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d", rv); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, "r11 sync regs value incorrect 0x%llx.", run->s.regs.gprs[11]); + TEST_ASSERT(run->s.regs.diag318 != 0x4B1D, + "diag318 sync regs value incorrect 0x%llx.", + run->s.regs.diag318); +} + +struct testdef { + const char *name; + void (*test)(struct kvm_vcpu *vcpu); +} testlist[] = { + { "read invalid", test_read_invalid }, + { "set invalid", test_set_invalid }, + { "request+verify all valid regs", test_req_and_verify_all_valid_regs }, + { "set+verify various regs", test_set_and_verify_various_reg_values }, + { "clear kvm_dirty_regs bits", test_clear_kvm_dirty_regs_bits }, +}; + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); + + ksft_print_header(); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(vcpu); + ksft_test_result_pass("%s\n", testlist[idx].name); + } kvm_vm_free(vm); - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c new file mode 100644 index 000000000000..7a742a673b7c --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test TEST PROTECTION emulation. + * + * Copyright IBM Corp. 
2021 + */ +#include <sys/mman.h> +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" +#include "ucall_common.h" + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1 << PAGE_SHIFT) +#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38)) +#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39)) + +static __aligned(PAGE_SIZE) uint8_t pages[2][PAGE_SIZE]; +static uint8_t *const page_store_prot = pages[0]; +static uint8_t *const page_fetch_prot = pages[1]; + +/* Nonzero return value indicates that address not mapped */ +static int set_storage_key(void *addr, uint8_t key) +{ + int not_mapped = 0; + + asm volatile ( + "lra %[addr], 0(0,%[addr])\n" + " jz 0f\n" + " llill %[not_mapped],1\n" + " j 1f\n" + "0: sske %[key], %[addr]\n" + "1:" + : [addr] "+&a" (addr), [not_mapped] "+r" (not_mapped) + : [key] "r" (key) + : "cc" + ); + return -not_mapped; +} + +enum permission { + READ_WRITE = 0, + READ = 1, + RW_PROTECTED = 2, + TRANSL_UNAVAIL = 3, +}; + +static enum permission test_protection(void *addr, uint8_t key) +{ + uint64_t mask; + + asm volatile ( + "tprot %[addr], 0(%[key])\n" + " ipm %[mask]\n" + : [mask] "=r" (mask) + : [addr] "Q" (*(char *)addr), + [key] "a" (key) + : "cc" + ); + + return (enum permission)(mask >> 28); +} + +enum stage { + STAGE_INIT_SIMPLE, + TEST_SIMPLE, + STAGE_INIT_FETCH_PROT_OVERRIDE, + TEST_FETCH_PROT_OVERRIDE, + TEST_STORAGE_PROT_OVERRIDE, + STAGE_END /* must be the last entry (it's the amount of tests) */ +}; + +struct test { + enum stage stage; + void *addr; + uint8_t key; + enum permission expected; +} tests[] = { + /* + * We perform each test in the array by executing TEST PROTECTION on + * the specified addr with the specified key and checking if the returned + * permissions match the expected value. + * Both guest and host cooperate to set up the required test conditions. + * A central condition is that the page targeted by addr has to be DAT + * protected in the host mappings, in order for KVM to emulate the + * TEST PROTECTION instruction. + * Since the page tables are shared, the host uses mprotect to achieve + * this. + * + * Test resulting in RW_PROTECTED/TRANSL_UNAVAIL will be interpreted + * by SIE, not KVM, but there is no harm in testing them also. 
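A decoding note for the hex constants used in the table entries that follow and in the set_storage_key() calls above (my reading of the s390 storage-key layout, stated as an assumption rather than taken from this patch): both the TPROT access key and the SSKE storage key carry the 4-bit key in the high nibble of the byte, and bit 0x08 of a storage key is the fetch-protection bit, so 0x10 means key 1, 0x20 key 2, 0x90 key 9, and 0x98 key 9 with fetch protection enabled.

/* Helper macros decoding the key bytes used by this test (assumed layout:
 * access-control key in the high nibble, fetch-protection bit at 0x08). */
#define KEY_OF(byte)            (((byte) >> 4) & 0xf)  /* 0x98 -> 9 */
#define FETCH_PROT_OF(byte)     (!!((byte) & 0x08))    /* 0x98 -> 1 */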
+ * See Enhanced Suppression-on-Protection Facilities in the + * Interpretive-Execution Mode + */ + /* + * guest: set storage key of page_store_prot to 1 + * storage key of page_fetch_prot to 9 and enable + * protection for it + * STAGE_INIT_SIMPLE + * host: write protect both via mprotect + */ + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_store_prot, 0x10, READ_WRITE }, + /* mismatched keys, but no fetch protection -> RO */ + { TEST_SIMPLE, page_store_prot, 0x20, READ }, + /* access key 0 matches any storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x00, READ_WRITE }, + /* access key matches storage key -> RW */ + { TEST_SIMPLE, page_fetch_prot, 0x90, READ_WRITE }, + /* mismatched keys, fetch protection -> inaccessible */ + { TEST_SIMPLE, page_fetch_prot, 0x10, RW_PROTECTED }, + /* page 0 not mapped yet -> translation not available */ + { TEST_SIMPLE, (void *)0x00, 0x10, TRANSL_UNAVAIL }, + /* + * host: try to map page 0 + * guest: set storage key of page 0 to 9 and enable fetch protection + * STAGE_INIT_FETCH_PROT_OVERRIDE + * host: write protect page 0 + * enable fetch protection override + */ + /* mismatched keys, fetch protection, but override applies -> RO */ + { TEST_FETCH_PROT_OVERRIDE, (void *)0x00, 0x10, READ }, + /* mismatched keys, fetch protection, override applies to 0-2048 only -> inaccessible */ + { TEST_FETCH_PROT_OVERRIDE, (void *)2049, 0x10, RW_PROTECTED }, + /* + * host: enable storage protection override + */ + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, page_fetch_prot, 0x10, READ_WRITE }, + /* mismatched keys, no fetch protection, override doesn't apply -> RO */ + { TEST_STORAGE_PROT_OVERRIDE, page_store_prot, 0x20, READ }, + /* mismatched keys, but override applies (storage key 9) -> RW */ + { TEST_STORAGE_PROT_OVERRIDE, (void *)2049, 0x10, READ_WRITE }, + /* end marker */ + { STAGE_END, 0, 0, 0 }, +}; + +static enum stage perform_next_stage(int *i, bool mapped_0) +{ + enum stage stage = tests[*i].stage; + enum permission result; + bool skip; + + for (; tests[*i].stage == stage; (*i)++) { + /* + * Some fetch protection override tests require that page 0 + * be mapped, however, when the hosts tries to map that page via + * vm_vaddr_alloc, it may happen that some other page gets mapped + * instead. 
+ * In order to skip these tests we detect this inside the guest + */ + skip = tests[*i].addr < (void *)4096 && + tests[*i].expected != TRANSL_UNAVAIL && + !mapped_0; + if (!skip) { + result = test_protection(tests[*i].addr, tests[*i].key); + __GUEST_ASSERT(result == tests[*i].expected, + "Wanted %u, got %u, for i = %u", + tests[*i].expected, result, *i); + } + } + return stage; +} + +static void guest_code(void) +{ + bool mapped_0; + int i = 0; + + GUEST_ASSERT_EQ(set_storage_key(page_store_prot, 0x10), 0); + GUEST_ASSERT_EQ(set_storage_key(page_fetch_prot, 0x98), 0); + GUEST_SYNC(STAGE_INIT_SIMPLE); + GUEST_SYNC(perform_next_stage(&i, false)); + + /* Fetch-protection override */ + mapped_0 = !set_storage_key((void *)0, 0x98); + GUEST_SYNC(STAGE_INIT_FETCH_PROT_OVERRIDE); + GUEST_SYNC(perform_next_stage(&i, mapped_0)); + + /* Storage-protection override */ + GUEST_SYNC(perform_next_stage(&i, mapped_0)); +} + +#define HOST_SYNC_NO_TAP(vcpup, stage) \ +({ \ + struct kvm_vcpu *__vcpu = (vcpup); \ + struct ucall uc; \ + int __stage = (stage); \ + \ + vcpu_run(__vcpu); \ + get_ucall(__vcpu, &uc); \ + if (uc.cmd == UCALL_ABORT) \ + REPORT_GUEST_ASSERT(uc); \ + TEST_ASSERT_EQ(uc.cmd, UCALL_SYNC); \ + TEST_ASSERT_EQ(uc.args[1], __stage); \ +}) + +#define HOST_SYNC(vcpu, stage) \ +({ \ + HOST_SYNC_NO_TAP(vcpu, stage); \ + ksft_test_result_pass("" #stage "\n"); \ +}) + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_run *run; + vm_vaddr_t guest_0_page; + + ksft_print_header(); + ksft_set_plan(STAGE_END); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + HOST_SYNC(vcpu, STAGE_INIT_SIMPLE); + mprotect(addr_gva2hva(vm, (vm_vaddr_t)pages), PAGE_SIZE * 2, PROT_READ); + HOST_SYNC(vcpu, TEST_SIMPLE); + + guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); + if (guest_0_page != 0) { + /* Use NO_TAP so we don't get a PASS print */ + HOST_SYNC_NO_TAP(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); + ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " + "Did not allocate page at 0\n"); + } else { + HOST_SYNC(vcpu, STAGE_INIT_FETCH_PROT_OVERRIDE); + } + if (guest_0_page == 0) + mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); + run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vcpu, TEST_FETCH_PROT_OVERRIDE); + + run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; + run->kvm_dirty_regs = KVM_SYNC_CRS; + HOST_SYNC(vcpu, TEST_STORAGE_PROT_OVERRIDE); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index b3ece55a2da6..bb8002084f52 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <pthread.h> #include <sched.h> @@ -17,8 +16,6 @@ #include <kvm_util.h> #include <processor.h> -#define VCPU_ID 0 - /* * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a * 2MB sized and aligned region so that the initial region corresponds to @@ -54,8 +51,8 @@ static inline uint64_t guest_spin_on_val(uint64_t spin_val) static void *vcpu_worker(void *data) { - struct kvm_vm *vm = data; - struct kvm_run *run; + struct kvm_vcpu *vcpu = data; + struct kvm_run *run = vcpu->run; struct ucall uc; uint64_t cmd; @@ 
-64,13 +61,11 @@ static void *vcpu_worker(void *data) * which will occur if the guest attempts to access a memslot after it * has been deleted or while it is being moved . */ - run = vcpu_state(vm, VCPU_ID); - while (1) { - vcpu_run(vm, VCPU_ID); + vcpu_run(vcpu); if (run->exit_reason == KVM_EXIT_IO) { - cmd = get_ucall(vm, VCPU_ID, &uc); + cmd = get_ucall(vcpu, &uc); if (cmd != UCALL_SYNC) break; @@ -92,8 +87,7 @@ static void *vcpu_worker(void *data) } if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) - TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2]); + REPORT_GUEST_ASSERT(uc); return NULL; } @@ -103,25 +97,24 @@ static void wait_for_vcpu(void) struct timespec ts; TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), - "clock_gettime() failed: %d\n", errno); + "clock_gettime() failed: %d", errno); ts.tv_sec += 2; TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), - "sem_timedwait() failed: %d\n", errno); + "sem_timedwait() failed: %d", errno); /* Wait for the vCPU thread to reenter the guest. */ usleep(100000); } -static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) +static struct kvm_vm *spawn_vm(struct kvm_vcpu **vcpu, pthread_t *vcpu_thread, + void *guest_code) { struct kvm_vm *vm; uint64_t *hva; uint64_t gpa; - vm = vm_create_default(VCPU_ID, 0, guest_code); - - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(vcpu, guest_code); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP, MEM_REGION_GPA, MEM_REGION_SLOT, @@ -134,13 +127,13 @@ static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code) gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT); TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n"); - virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0); + virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2); /* Ditto for the host mapping so that both pages can be zeroed. */ hva = addr_gpa2hva(vm, MEM_REGION_GPA); memset(hva, 0, 2 * 4096); - pthread_create(vcpu_thread, NULL, vcpu_worker, vm); + pthread_create(vcpu_thread, NULL, vcpu_worker, *vcpu); /* Ensure the guest thread is spun up. */ wait_for_vcpu(); @@ -156,16 +149,28 @@ static void guest_code_move_memory_region(void) GUEST_SYNC(0); /* - * Spin until the memory region is moved to a misaligned address. This - * may or may not trigger MMIO, as the window where the memslot is - * invalid is quite small. + * Spin until the memory region starts getting moved to a + * misaligned address. + * Every region move may or may not trigger MMIO, as the + * window where the memslot is invalid is usually quite small. */ val = guest_spin_on_val(0); - GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + __GUEST_ASSERT(val == 1 || val == MMIO_VAL, + "Expected '1' or MMIO ('%lx'), got '%lx'", MMIO_VAL, val); - /* Spin until the memory region is realigned. */ + /* Spin until the misaligning memory region move completes. */ val = guest_spin_on_val(MMIO_VAL); - GUEST_ASSERT_1(val == 1, val); + __GUEST_ASSERT(val == 1 || val == 0, + "Expected '0' or '1' (no MMIO), got '%lx'", val); + + /* Spin until the memory region starts to get re-aligned. */ + val = guest_spin_on_val(0); + __GUEST_ASSERT(val == 1 || val == MMIO_VAL, + "Expected '1' or MMIO ('%lx'), got '%lx'", MMIO_VAL, val); + + /* Spin until the re-aligning memory region move completes. 
*/ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_EQ(val, 1); GUEST_DONE(); } @@ -173,10 +178,11 @@ static void guest_code_move_memory_region(void) static void test_move_memory_region(void) { pthread_t vcpu_thread; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; uint64_t *hva; - vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region); + vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region); hva = addr_gpa2hva(vm, MEM_REGION_GPA); @@ -214,21 +220,33 @@ static void test_move_memory_region(void) static void guest_code_delete_memory_region(void) { + struct desc_ptr idt; uint64_t val; + /* + * Clobber the IDT so that a #PF due to the memory region being deleted + * escalates to triple-fault shutdown. Because the memory region is + * deleted, there will be no valid mappings. As a result, KVM will + * repeatedly intercepts the state-2 page fault that occurs when trying + * to vector the guest's #PF. I.e. trying to actually handle the #PF + * in the guest will never succeed, and so isn't an option. + */ + memset(&idt, 0, sizeof(idt)); + __asm__ __volatile__("lidt %0" :: "m"(idt)); + GUEST_SYNC(0); /* Spin until the memory region is deleted. */ val = guest_spin_on_val(0); - GUEST_ASSERT_1(val == MMIO_VAL, val); + GUEST_ASSERT_EQ(val, MMIO_VAL); /* Spin until the memory region is recreated. */ val = guest_spin_on_val(MMIO_VAL); - GUEST_ASSERT_1(val == 0, val); + GUEST_ASSERT_EQ(val, 0); /* Spin until the memory region is deleted. */ val = guest_spin_on_val(0); - GUEST_ASSERT_1(val == MMIO_VAL, val); + GUEST_ASSERT_EQ(val, MMIO_VAL); asm("1:\n\t" ".pushsection .rodata\n\t" @@ -245,17 +263,18 @@ static void guest_code_delete_memory_region(void) "final_rip_end: .quad 1b\n\t" ".popsection"); - GUEST_ASSERT_1(0, 0); + GUEST_ASSERT(0); } static void test_delete_memory_region(void) { pthread_t vcpu_thread; + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_run *run; struct kvm_vm *vm; - vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region); + vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region); /* Delete the memory region, the guest should not die. 
*/ vm_mem_region_delete(vm, MEM_REGION_SLOT); @@ -279,13 +298,13 @@ static void test_delete_memory_region(void) pthread_join(vcpu_thread, NULL); - run = vcpu_state(vm, VCPU_ID); + run = vcpu->run; TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN || run->exit_reason == KVM_EXIT_INTERNAL_ERROR, "Unexpected exit reason = %d", run->exit_reason); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vcpu, ®s); /* * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already, @@ -294,7 +313,7 @@ static void test_delete_memory_region(void) if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR) TEST_ASSERT(regs.rip >= final_rip_start && regs.rip < final_rip_end, - "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n", + "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx", final_rip_start, final_rip_end, regs.rip); kvm_vm_free(vm); @@ -302,26 +321,81 @@ static void test_delete_memory_region(void) static void test_zero_memory_regions(void) { - struct kvm_run *run; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; pr_info("Testing KVM_RUN with zero added memory regions\n"); - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - - TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64), - "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno); - vcpu_run(vm, VCPU_ID); + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); - run = vcpu_state(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, - "Unexpected exit_reason = %u\n", run->exit_reason); + vm_ioctl(vm, KVM_SET_NR_MMU_PAGES, (void *)64ul); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); kvm_vm_free(vm); } #endif /* __x86_64__ */ +static void test_invalid_memory_region_flags(void) +{ + uint32_t supported_flags = KVM_MEM_LOG_DIRTY_PAGES; + const uint32_t v2_only_flags = KVM_MEM_GUEST_MEMFD; + struct kvm_vm *vm; + int r, i; + +#if defined __aarch64__ || defined __riscv || defined __x86_64__ + supported_flags |= KVM_MEM_READONLY; +#endif + +#ifdef __x86_64__ + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); + else +#endif + vm = vm_create_barebones(); + + if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE) + supported_flags |= KVM_MEM_GUEST_MEMFD; + + for (i = 0; i < 32; i++) { + if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) + continue; + + r = __vm_set_user_memory_region(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL); + + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i)); + + if (supported_flags & BIT(i)) + continue; + + r = __vm_set_user_memory_region2(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL, 0, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i)); + } + + if (supported_flags & KVM_MEM_GUEST_MEMFD) { + int guest_memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + + r = __vm_set_user_memory_region2(vm, 0, + KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, + 0, MEM_REGION_SIZE, NULL, guest_memfd, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + + r = __vm_set_user_memory_region2(vm, 0, + KVM_MEM_READONLY | KVM_MEM_GUEST_MEMFD, + 0, MEM_REGION_SIZE, NULL, guest_memfd, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed, read-only GUEST_MEMFD memslots are unsupported"); + + close(guest_memfd); + } +} + /* * 
Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any * tentative to add further slots should fail. @@ -332,54 +406,154 @@ static void test_add_max_memory_regions(void) struct kvm_vm *vm; uint32_t max_mem_slots; uint32_t slot; - uint64_t guest_addr = 0x0; - uint64_t mem_reg_npages; - void *mem; + void *mem, *mem_aligned, *mem_extra; + size_t alignment; + +#ifdef __s390x__ + /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ + alignment = 0x100000; +#else + alignment = 1; +#endif max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); TEST_ASSERT(max_mem_slots > 0, "KVM_CAP_NR_MEMSLOTS should be greater than 0"); pr_info("Allowed number of memory slots: %i\n", max_mem_slots); - vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); - - mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE); + vm = vm_create_barebones(); /* Check it can be added memory slots up to the maximum allowed */ pr_info("Adding slots 0..%i, each memory region with %dK size\n", (max_mem_slots - 1), MEM_REGION_SIZE >> 10); - for (slot = 0; slot < max_mem_slots; slot++) { - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_addr, slot, mem_reg_npages, - 0); - guest_addr += MEM_REGION_SIZE; - } - /* Check it cannot be added memory slots beyond the limit */ - mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host"); + mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1)); - ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION, - &(struct kvm_userspace_memory_region) {slot, 0, guest_addr, - MEM_REGION_SIZE, (uint64_t) mem}); + for (slot = 0; slot < max_mem_slots; slot++) + vm_set_user_memory_region(vm, slot, 0, + ((uint64_t)slot * MEM_REGION_SIZE), + MEM_REGION_SIZE, + mem_aligned + (uint64_t)slot * MEM_REGION_SIZE); + + /* Check it cannot be added memory slots beyond the limit */ + mem_extra = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + TEST_ASSERT(mem_extra != MAP_FAILED, "Failed to mmap() host"); + + ret = __vm_set_user_memory_region(vm, max_mem_slots, 0, + (uint64_t)max_mem_slots * MEM_REGION_SIZE, + MEM_REGION_SIZE, mem_extra); TEST_ASSERT(ret == -1 && errno == EINVAL, "Adding one more memory slot should fail with EINVAL"); - munmap(mem, MEM_REGION_SIZE); + munmap(mem, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment); + munmap(mem_extra, MEM_REGION_SIZE); + kvm_vm_free(vm); +} + + +#ifdef __x86_64__ +static void test_invalid_guest_memfd(struct kvm_vm *vm, int memfd, + size_t offset, const char *msg) +{ + int r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, + 0, memfd, offset); + TEST_ASSERT(r == -1 && errno == EINVAL, "%s", msg); +} + +static void test_add_private_memory_region(void) +{ + struct kvm_vm *vm, *vm2; + int memfd, i; + + pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); + + test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); + test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); + + memfd = kvm_memfd_alloc(MEM_REGION_SIZE, false); + test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); + close(memfd); + + vm2 = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); + 
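test_add_max_memory_regions() above backs all slots with one large anonymous mapping and rounds its start up to 'alignment' because mmap() only guarantees page alignment, while (per the comment there) s390x memslots require a 1M-aligned host address. The round-up expression used for mem_aligned is the standard align-up idiom; spelled out as a helper (illustrative only):

#include <stdint.h>

/* Align-up idiom as used for mem_aligned above: over-allocate by
 * 'alignment' bytes, then round the start address up to the next
 * multiple of 'alignment' (which must be a power of two). */
static inline void *align_ptr_up(void *p, uintptr_t alignment)
{
        return (void *)(((uintptr_t)p + alignment - 1) & ~(alignment - 1));
}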
memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); + test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); + + vm_set_user_memory_region2(vm2, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + kvm_vm_free(vm2); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + for (i = 1; i < PAGE_SIZE; i++) + test_invalid_guest_memfd(vm, memfd, i, "Unaligned offset should fail"); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + kvm_vm_free(vm); } +static void test_add_overlapping_private_memory_regions(void) +{ + struct kvm_vm *vm; + int memfd; + int r; + + pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_type(KVM_X86_SW_PROTECTED_VM); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE * 2, 0, memfd, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT + 1, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2, MEM_REGION_SIZE * 2, + 0, memfd, MEM_REGION_SIZE * 2); + + /* + * Delete the first memslot, and then attempt to recreate it except + * with a "bad" offset that results in overlap in the guest_memfd(). + */ + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, 0, NULL, -1, 0); + + /* Overlap the front half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 - MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + /* And now the back half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 + MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + close(memfd); + kvm_vm_free(vm); +} +#endif + int main(int argc, char *argv[]) { #ifdef __x86_64__ int i, loops; -#endif - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - -#ifdef __x86_64__ /* * FIXME: the zero-memslot test fails on aarch64 and s390x because * KVM_RUN fails with ENOEXEC or EFAULT. @@ -387,11 +561,21 @@ int main(int argc, char *argv[]) test_zero_memory_regions(); #endif + test_invalid_memory_region_flags(); + test_add_max_memory_regions(); #ifdef __x86_64__ + if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) && + (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { + test_add_private_memory_region(); + test_add_overlapping_private_memory_regions(); + } else { + pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); + } + if (argc > 1) - loops = atoi(argv[1]); + loops = atoi_positive("Number of iterations", argv[1]); else loops = 10; diff --git a/tools/testing/selftests/kvm/settings b/tools/testing/selftests/kvm/settings new file mode 100644 index 000000000000..6091b45d226b --- /dev/null +++ b/tools/testing/selftests/kvm/settings @@ -0,0 +1 @@ +timeout=120 diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index fcc840088c91..a8d3afa0b86b 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -4,23 +4,25 @@ * * Copyright (C) 2020, Red Hat, Inc. 
*/ -#define _GNU_SOURCE #include <stdio.h> #include <time.h> #include <sched.h> #include <pthread.h> #include <linux/kernel.h> -#include <sys/syscall.h> #include <asm/kvm.h> +#ifdef __riscv +#include "sbi.h" +#else #include <asm/kvm_para.h> +#endif #include "test_util.h" #include "kvm_util.h" #include "processor.h" +#include "ucall_common.h" #define NR_VCPUS 4 #define ST_GPA_BASE (1 << 30) -#define MIN_RUN_DELAY_NS 200000UL static void *st_gva[NR_VCPUS]; static uint64_t guest_stolen_time[NR_VCPUS]; @@ -33,8 +35,8 @@ static uint64_t guest_stolen_time[NR_VCPUS]; static void check_status(struct kvm_steal_time *st) { GUEST_ASSERT(!(READ_ONCE(st->version) & 1)); - GUEST_ASSERT(READ_ONCE(st->flags) == 0); - GUEST_ASSERT(READ_ONCE(st->preempted) == 0); + GUEST_ASSERT_EQ(READ_ONCE(st->flags), 0); + GUEST_ASSERT_EQ(READ_ONCE(st->preempted), 0); } static void guest_code(int cpu) @@ -42,7 +44,7 @@ static void guest_code(int cpu) struct kvm_steal_time *st = st_gva[cpu]; uint32_t version; - GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED)); + GUEST_ASSERT_EQ(rdmsr(MSR_KVM_STEAL_TIME), ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED)); memset(st, 0, sizeof(*st)); GUEST_SYNC(0); @@ -60,49 +62,41 @@ static void guest_code(int cpu) GUEST_DONE(); } -static void steal_time_init(struct kvm_vm *vm) +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { - int i; - - if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax & - KVM_FEATURE_STEAL_TIME)) { - print_skip("steal-time not supported"); - exit(KSFT_SKIP); - } - - for (i = 0; i < NR_VCPUS; ++i) { - int ret; + return kvm_cpu_has(X86_FEATURE_KVM_STEAL_TIME); +} - vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid()); +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + int ret; - /* ST_GPA_BASE is identity mapped */ - st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); - sync_global_to_guest(vm, st_gva[i]); + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vcpu->vm, st_gva[i]); - ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); - TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); + ret = _vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, + (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); + TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); - vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); - } + vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); } -static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid) +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { - struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); - int i; - - pr_info("VCPU%d:\n", vcpuid); - pr_info(" steal: %lld\n", st->steal); - pr_info(" version: %d\n", st->version); - pr_info(" flags: %d\n", st->flags); - pr_info(" preempted: %d\n", st->preempted); - pr_info(" u8_pad: "); - for (i = 0; i < 3; ++i) - pr_info("%d", st->u8_pad[i]); - pr_info("\n pad: "); - for (i = 0; i < 11; ++i) - pr_info("%d", st->pad[i]); - pr_info("\n"); + struct kvm_steal_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); + + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" steal: %lld\n", st->steal); + ksft_print_msg(" version: %d\n", st->version); + ksft_print_msg(" flags: %d\n", st->flags); + ksft_print_msg(" preempted: %d\n", st->preempted); + ksft_print_msg(" u8_pad: %d %d %d\n", + st->u8_pad[0], st->u8_pad[1], st->u8_pad[2]); + ksft_print_msg(" pad: %d %d %d %d %d %d %d 
%d %d %d %d\n", + st->pad[0], st->pad[1], st->pad[2], st->pad[3], + st->pad[4], st->pad[5], st->pad[6], st->pad[7], + st->pad[8], st->pad[9], st->pad[10]); } #elif defined(__aarch64__) @@ -120,25 +114,18 @@ struct st_time { uint64_t st_time; }; -static int64_t smccc(uint32_t func, uint32_t arg) +static int64_t smccc(uint32_t func, uint64_t arg) { - unsigned long ret; + struct arm_smccc_res res; - asm volatile( - "mov x0, %1\n" - "mov x1, %2\n" - "hvc #0\n" - "mov %0, x0\n" - : "=r" (ret) : "r" (func), "r" (arg) : - "x0", "x1", "x2", "x3"); - - return ret; + smccc_hvc(func, arg, 0, 0, 0, 0, 0, 0, &res); + return res.a0; } static void check_status(struct st_time *st) { - GUEST_ASSERT(READ_ONCE(st->rev) == 0); - GUEST_ASSERT(READ_ONCE(st->attr) == 0); + GUEST_ASSERT_EQ(READ_ONCE(st->rev), 0); + GUEST_ASSERT_EQ(READ_ONCE(st->attr), 0); } static void guest_code(int cpu) @@ -147,15 +134,15 @@ static void guest_code(int cpu) int64_t status; status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES); - GUEST_ASSERT(status == 0); + GUEST_ASSERT_EQ(status, 0); status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES); - GUEST_ASSERT(status == 0); + GUEST_ASSERT_EQ(status, 0); status = smccc(PV_TIME_FEATURES, PV_TIME_ST); - GUEST_ASSERT(status == 0); + GUEST_ASSERT_EQ(status, 0); status = smccc(PV_TIME_ST, 0); - GUEST_ASSERT(status != -1); - GUEST_ASSERT(status == (ulong)st_gva[cpu]); + GUEST_ASSERT_NE(status, -1); + GUEST_ASSERT_EQ(status, (ulong)st_gva[cpu]); st = (struct st_time *)status; GUEST_SYNC(0); @@ -169,70 +156,154 @@ static void guest_code(int cpu) GUEST_DONE(); } -static void steal_time_init(struct kvm_vm *vm) +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) { struct kvm_device_attr dev = { .group = KVM_ARM_VCPU_PVTIME_CTRL, .attr = KVM_ARM_VCPU_PVTIME_IPA, }; - int i, ret; - - ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev); - if (ret != 0 && errno == ENXIO) { - print_skip("steal-time not supported"); - exit(KSFT_SKIP); - } - for (i = 0; i < NR_VCPUS; ++i) { - uint64_t st_ipa; + return !__vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); +} - vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev); +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + struct kvm_vm *vm = vcpu->vm; + uint64_t st_ipa; + int ret; - dev.addr = (uint64_t)&st_ipa; + struct kvm_device_attr dev = { + .group = KVM_ARM_VCPU_PVTIME_CTRL, + .attr = KVM_ARM_VCPU_PVTIME_IPA, + .addr = (uint64_t)&st_ipa, + }; - /* ST_GPA_BASE is identity mapped */ - st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); - sync_global_to_guest(vm, st_gva[i]); + vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); - st_ipa = (ulong)st_gva[i] | 1; - ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + sync_global_to_guest(vm, st_gva[i]); - st_ipa = (ulong)st_gva[i]; - vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); + st_ipa = (ulong)st_gva[i] | 1; + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); - ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); + st_ipa = (ulong)st_gva[i]; + vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); - } + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); } -static void steal_time_dump(struct kvm_vm *vm, 
uint32_t vcpuid) +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) { - struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpuid]); + struct st_time *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); - pr_info("VCPU%d:\n", vcpuid); - pr_info(" rev: %d\n", st->rev); - pr_info(" attr: %d\n", st->attr); - pr_info(" st_time: %ld\n", st->st_time); + ksft_print_msg("VCPU%d:\n", vcpu_idx); + ksft_print_msg(" rev: %d\n", st->rev); + ksft_print_msg(" attr: %d\n", st->attr); + ksft_print_msg(" st_time: %ld\n", st->st_time); } +#elif defined(__riscv) + +/* SBI STA shmem must have 64-byte alignment */ +#define STEAL_TIME_SIZE ((sizeof(struct sta_struct) + 63) & ~63) + +static vm_paddr_t st_gpa[NR_VCPUS]; + +struct sta_struct { + uint32_t sequence; + uint32_t flags; + uint64_t steal; + uint8_t preempted; + uint8_t pad[47]; +} __packed; + +static void sta_set_shmem(vm_paddr_t gpa, unsigned long flags) +{ + unsigned long lo = (unsigned long)gpa; +#if __riscv_xlen == 32 + unsigned long hi = (unsigned long)(gpa >> 32); +#else + unsigned long hi = gpa == -1 ? -1 : 0; #endif + struct sbiret ret = sbi_ecall(SBI_EXT_STA, 0, lo, hi, flags, 0, 0, 0); -static long get_run_delay(void) + GUEST_ASSERT(ret.value == 0 && ret.error == 0); +} + +static void check_status(struct sta_struct *st) { - char path[64]; - long val[2]; - FILE *fp; + GUEST_ASSERT(!(READ_ONCE(st->sequence) & 1)); + GUEST_ASSERT(READ_ONCE(st->flags) == 0); + GUEST_ASSERT(READ_ONCE(st->preempted) == 0); +} - sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); - fp = fopen(path, "r"); - fscanf(fp, "%ld %ld ", &val[0], &val[1]); - fclose(fp); +static void guest_code(int cpu) +{ + struct sta_struct *st = st_gva[cpu]; + uint32_t sequence; + long out_val = 0; + bool probe; + + probe = guest_sbi_probe_extension(SBI_EXT_STA, &out_val); + GUEST_ASSERT(probe && out_val == 1); + + sta_set_shmem(st_gpa[cpu], 0); + GUEST_SYNC(0); + + check_status(st); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + sequence = READ_ONCE(st->sequence); + check_status(st); + GUEST_SYNC(1); + + check_status(st); + GUEST_ASSERT(sequence < READ_ONCE(st->sequence)); + WRITE_ONCE(guest_stolen_time[cpu], st->steal); + check_status(st); + GUEST_DONE(); +} + +static bool is_steal_time_supported(struct kvm_vcpu *vcpu) +{ + uint64_t id = RISCV_SBI_EXT_REG(KVM_RISCV_SBI_EXT_STA); + unsigned long enabled; + + vcpu_get_reg(vcpu, id, &enabled); + TEST_ASSERT(enabled == 0 || enabled == 1, "Expected boolean result"); - return val[1]; + return enabled; } +static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) +{ + /* ST_GPA_BASE is identity mapped */ + st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); + st_gpa[i] = addr_gva2gpa(vcpu->vm, (vm_vaddr_t)st_gva[i]); + sync_global_to_guest(vcpu->vm, st_gva[i]); + sync_global_to_guest(vcpu->vm, st_gpa[i]); +} + +static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) +{ + struct sta_struct *st = addr_gva2hva(vm, (ulong)st_gva[vcpu_idx]); + int i; + + pr_info("VCPU%d:\n", vcpu_idx); + pr_info(" sequence: %d\n", st->sequence); + pr_info(" flags: %d\n", st->flags); + pr_info(" steal: %"PRIu64"\n", st->steal); + pr_info(" preempted: %d\n", st->preempted); + pr_info(" pad: "); + for (i = 0; i < 47; ++i) + pr_info("%d", st->pad[i]); + pr_info("\n"); +} + +#endif + static void *do_steal_time(void *arg) { struct timespec ts, stop; @@ -249,29 +320,27 @@ static void *do_steal_time(void *arg) return NULL; } -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +static void run_vcpu(struct kvm_vcpu *vcpu) { 
struct ucall uc; - vcpu_args_set(vm, vcpuid, 1, vcpuid); + vcpu_run(vcpu); - vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); - - switch (get_ucall(vm, vcpuid, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: case UCALL_DONE: break; case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); default: TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + exit_reason_str(vcpu->run->exit_reason)); } } int main(int ac, char **av) { + struct kvm_vcpu *vcpus[NR_VCPUS]; struct kvm_vm *vm; pthread_attr_t attr; pthread_t thread; @@ -291,26 +360,27 @@ int main(int ac, char **av) pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - /* Create a one VCPU guest and an identity mapped memslot for the steal time structure */ - vm = vm_create_default(0, 0, guest_code); + /* Create a VM and an identity mapped memslot for the steal time structure */ + vm = vm_create_with_vcpus(NR_VCPUS, guest_code, vcpus); gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0); - virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0); - ucall_init(vm, NULL); - - /* Add the rest of the VCPUs */ - for (i = 1; i < NR_VCPUS; ++i) - vm_vcpu_add_default(vm, i, guest_code); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages); - steal_time_init(vm); + ksft_print_header(); + TEST_REQUIRE(is_steal_time_supported(vcpus[0])); + ksft_set_plan(NR_VCPUS); /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { + steal_time_init(vcpus[i], i); + + vcpu_args_set(vcpus[i], 1, i); + /* First VCPU run initializes steal-time */ - run_vcpu(vm, i); + run_vcpu(vcpus[i]); /* Second VCPU run, expect guest stolen time to be <= run_delay */ - run_vcpu(vm, i); + run_vcpu(vcpus[i]); sync_global_from_guest(vm, guest_stolen_time[i]); stolen_time = guest_stolen_time[i]; run_delay = get_run_delay(); @@ -322,7 +392,7 @@ int main(int ac, char **av) run_delay = get_run_delay(); pthread_create(&thread, &attr, do_steal_time, NULL); do - pthread_yield(); + sched_yield(); while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS); pthread_join(thread, NULL); run_delay = get_run_delay() - run_delay; @@ -331,7 +401,7 @@ int main(int ac, char **av) MIN_RUN_DELAY_NS, run_delay); /* Run VCPU again to confirm stolen time is consistent with run_delay */ - run_vcpu(vm, i); + run_vcpu(vcpus[i]); sync_global_from_guest(vm, guest_stolen_time[i]); stolen_time = guest_stolen_time[i] - stolen_time; TEST_ASSERT(stolen_time >= run_delay, @@ -339,14 +409,15 @@ int main(int ac, char **av) run_delay, stolen_time); if (verbose) { - pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i, - guest_stolen_time[i], stolen_time); - if (stolen_time == run_delay) - pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)"); - pr_info("\n"); + ksft_print_msg("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld%s\n", + i, guest_stolen_time[i], stolen_time, + stolen_time == run_delay ? 
+ " (BONUS: guest test-stolen-time even exactly matches test-run_delay)" : ""); steal_time_dump(vm, i); } + ksft_test_result_pass("vcpu%d\n", i); } - return 0; + /* Print results and exit() accordingly */ + ksft_finished(); } diff --git a/tools/testing/selftests/kvm/system_counter_offset_test.c b/tools/testing/selftests/kvm/system_counter_offset_test.c new file mode 100644 index 000000000000..513d421a9bff --- /dev/null +++ b/tools/testing/selftests/kvm/system_counter_offset_test.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Google LLC. + * + * Tests for adjusting the system counter from userspace + */ +#include <asm/kvm_para.h> +#include <stdint.h> +#include <string.h> +#include <sys/stat.h> +#include <time.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#ifdef __x86_64__ + +struct test_case { + uint64_t tsc_offset; +}; + +static struct test_case test_cases[] = { + { 0 }, + { 180 * NSEC_PER_SEC }, + { -180 * NSEC_PER_SEC }, +}; + +static void check_preconditions(struct kvm_vcpu *vcpu) +{ + __TEST_REQUIRE(!__vcpu_has_device_attr(vcpu, KVM_VCPU_TSC_CTRL, + KVM_VCPU_TSC_OFFSET), + "KVM_VCPU_TSC_OFFSET not supported; skipping test"); +} + +static void setup_system_counter(struct kvm_vcpu *vcpu, struct test_case *test) +{ + vcpu_device_attr_set(vcpu, KVM_VCPU_TSC_CTRL, KVM_VCPU_TSC_OFFSET, + &test->tsc_offset); +} + +static uint64_t guest_read_system_counter(struct test_case *test) +{ + return rdtsc(); +} + +static uint64_t host_read_guest_system_counter(struct test_case *test) +{ + return rdtsc() + test->tsc_offset; +} + +#else /* __x86_64__ */ + +#error test not implemented for this architecture! + +#endif + +#define GUEST_SYNC_CLOCK(__stage, __val) \ + GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0) + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + struct test_case *test = &test_cases[i]; + + GUEST_SYNC_CLOCK(i, guest_read_system_counter(test)); + } +} + +static void handle_sync(struct ucall *uc, uint64_t start, uint64_t end) +{ + uint64_t obs = uc->args[2]; + + TEST_ASSERT(start <= obs && obs <= end, + "unexpected system counter value: %"PRIu64" expected range: [%"PRIu64", %"PRIu64"]", + obs, start, end); + + pr_info("system counter value: %"PRIu64" expected range [%"PRIu64", %"PRIu64"]\n", + obs, start, end); +} + +static void handle_abort(struct ucall *uc) +{ + REPORT_GUEST_ASSERT(*uc); +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + uint64_t start, end; + struct ucall uc; + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + struct test_case *test = &test_cases[i]; + + setup_system_counter(vcpu, test); + start = host_read_guest_system_counter(test); + vcpu_run(vcpu); + end = host_read_guest_system_counter(test); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(&uc, start, end); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + default: + TEST_ASSERT(0, "unhandled ucall %ld", + get_ucall(vcpu, &uc)); + } + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + check_preconditions(vcpu); + + enter_guest(vcpu); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c new file mode 100644 index 000000000000..903940c54d2d --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/amx_test.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * amx tests + * + * Copyright (C) 
2021, Intel, Inc. + * + * Tests for amx #NM exception and save/restore. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +#define NUM_TILES 8 +#define TILE_SIZE 1024 +#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE) + +/* Tile configuration associated: */ +#define PALETTE_TABLE_INDEX 1 +#define MAX_TILES 16 +#define RESERVED_BYTES 14 + +#define XSAVE_HDR_OFFSET 512 + +struct tile_config { + u8 palette_id; + u8 start_row; + u8 reserved[RESERVED_BYTES]; + u16 colsb[MAX_TILES]; + u8 rows[MAX_TILES]; +}; + +struct tile_data { + u8 data[NUM_TILES * TILE_SIZE]; +}; + +struct xtile_info { + u16 bytes_per_tile; + u16 bytes_per_row; + u16 max_names; + u16 max_rows; + u32 xsave_offset; + u32 xsave_size; +}; + +static struct xtile_info xtile; + +static inline void __ldtilecfg(void *cfg) +{ + asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00" + : : "a"(cfg)); +} + +static inline void __tileloadd(void *tile) +{ + asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10" + : : "a"(tile), "d"(0)); +} + +static inline void __tilerelease(void) +{ + asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::); +} + +static inline void __xsavec(struct xstate *xstate, uint64_t rfbm) +{ + uint32_t rfbm_lo = rfbm; + uint32_t rfbm_hi = rfbm >> 32; + + asm volatile("xsavec (%%rdi)" + : : "D" (xstate), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static void check_xtile_info(void) +{ + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0)); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE); + + xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET); + GUEST_ASSERT(xtile.xsave_offset == 2816); + xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE); + GUEST_ASSERT(xtile.xsave_size == 8192); + GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size); + + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_MAX_PALETTE_TABLES)); + GUEST_ASSERT(this_cpu_property(X86_PROPERTY_AMX_MAX_PALETTE_TABLES) >= + PALETTE_TABLE_INDEX); + + GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS)); + xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS); + GUEST_ASSERT(xtile.max_names == 8); + xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE); + GUEST_ASSERT(xtile.bytes_per_tile == 1024); + xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW); + GUEST_ASSERT(xtile.bytes_per_row == 64); + xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS); + GUEST_ASSERT(xtile.max_rows == 16); +} + +static void set_tilecfg(struct tile_config *cfg) +{ + int i; + + /* Only palette id 1 */ + cfg->palette_id = 1; + for (i = 0; i < xtile.max_names; i++) { + cfg->colsb[i] = xtile.bytes_per_row; + cfg->rows[i] = xtile.max_rows; + } +} + +static void init_regs(void) +{ + uint64_t cr4, xcr0; + + GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE)); + + /* turn on CR4.OSXSAVE */ + cr4 = get_cr4(); + cr4 |= X86_CR4_OSXSAVE; + set_cr4(cr4); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); + + xcr0 = xgetbv(0); + xcr0 |= XFEATURE_MASK_XTILE; + xsetbv(0x0, xcr0); + GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE); +} + +static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg, + struct tile_data *tiledata, + struct xstate *xstate) +{ + init_regs(); + 
check_xtile_info(); + GUEST_SYNC(1); + + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(2); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + GUEST_SYNC(3); + /* Check save/restore when trap to userspace */ + __tileloadd(tiledata); + GUEST_SYNC(4); + __tilerelease(); + GUEST_SYNC(5); + /* + * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in + * the xcomp_bv. + */ + xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; + __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); + GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA); + + /* xfd=0x40000, disable amx tiledata */ + wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA); + + /* + * XTILEDATA is cleared in xstate_bv but set in xcomp_bv, this property + * remains the same even when amx tiledata is disabled by IA32_XFD. + */ + xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA; + __xsavec(xstate, XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA)); + GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA)); + + GUEST_SYNC(6); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + set_tilecfg(amx_cfg); + __ldtilecfg(amx_cfg); + /* Trigger #NM exception */ + __tileloadd(tiledata); + GUEST_SYNC(10); + + GUEST_DONE(); +} + +void guest_nm_handler(struct ex_regs *regs) +{ + /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */ + GUEST_SYNC(7); + GUEST_ASSERT(!(get_cr0() & X86_CR0_TS)); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + GUEST_SYNC(8); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA); + GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA); + /* Clear xfd_err */ + wrmsr(MSR_IA32_XFD_ERR, 0); + /* xfd=0, enable amx */ + wrmsr(MSR_IA32_XFD, 0); + GUEST_SYNC(9); +} + +int main(int argc, char *argv[]) +{ + struct kvm_regs regs1, regs2; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_x86_state *state; + int xsave_restore_size; + vm_vaddr_t amx_cfg, tiledata, xstate; + struct ucall uc; + u32 amx_offset; + int ret; + + /* + * Note, all off-by-default features must be enabled before anything + * caches KVM_GET_SUPPORTED_CPUID, e.g. before using kvm_cpu_has(). 
+ */ + vm_xsave_require_permission(XFEATURE_MASK_XTILE_DATA); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA_XFD)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE), + "KVM should enumerate max XSAVE size when XSAVE is supported"); + xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE); + + vcpu_regs_get(vcpu, ®s1); + + /* Register #NM handler */ + vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler); + + /* amx cfg for guest_code */ + amx_cfg = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize()); + + /* amx tiledata for guest_code */ + tiledata = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize()); + + /* XSAVE state for guest_code */ + xstate = vm_vaddr_alloc_pages(vm, DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); + memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE)); + vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + switch (uc.args[1]) { + case 1: + case 2: + case 3: + case 5: + case 6: + case 7: + case 8: + fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]); + break; + case 4: + case 10: + fprintf(stderr, + "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]); + + /* Compacted mode, get amx offset by xsave area + * size subtract 8K amx size. + */ + amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE; + state = vcpu_save_state(vcpu); + void *amx_start = (void *)state->xsave + amx_offset; + void *tiles_data = (void *)addr_gva2hva(vm, tiledata); + /* Only check TMM0 register, 1 tile */ + ret = memcmp(amx_start, tiles_data, TILE_SIZE); + TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret); + kvm_x86_state_cleanup(state); + break; + case 9: + fprintf(stderr, + "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]); + break; + } + break; + case UCALL_DONE: + fprintf(stderr, "UCALL_DONE\n"); + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + } +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c new file mode 100644 index 000000000000..8c579ce714e9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat Inc. 
+ * + * Generic tests for KVM CPUID set/get ioctls + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* CPUIDs known to differ */ +struct { + u32 function; + u32 index; +} mangled_cpuids[] = { + /* + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, + * which are not controlled for by this test. + */ + {.function = 0xd, .index = 0}, + {.function = 0xd, .index = 1}, +}; + +static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) +{ + int i; + u32 eax, ebx, ecx, edx; + + for (i = 0; i < guest_cpuid->nent; i++) { + __cpuid(guest_cpuid->entries[i].function, + guest_cpuid->entries[i].index, + &eax, &ebx, &ecx, &edx); + + GUEST_ASSERT_EQ(eax, guest_cpuid->entries[i].eax); + GUEST_ASSERT_EQ(ebx, guest_cpuid->entries[i].ebx); + GUEST_ASSERT_EQ(ecx, guest_cpuid->entries[i].ecx); + GUEST_ASSERT_EQ(edx, guest_cpuid->entries[i].edx); + } + +} + +static void guest_main(struct kvm_cpuid2 *guest_cpuid) +{ + GUEST_SYNC(1); + + test_guest_cpuids(guest_cpuid); + + GUEST_SYNC(2); + + GUEST_ASSERT_EQ(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF), 0x40000001); + + GUEST_DONE(); +} + +static bool is_cpuid_mangled(const struct kvm_cpuid_entry2 *entrie) +{ + int i; + + for (i = 0; i < sizeof(mangled_cpuids); i++) { + if (mangled_cpuids[i].function == entrie->function && + mangled_cpuids[i].index == entrie->index) + return true; + } + + return false; +} + +static void compare_cpuids(const struct kvm_cpuid2 *cpuid1, + const struct kvm_cpuid2 *cpuid2) +{ + const struct kvm_cpuid_entry2 *e1, *e2; + int i; + + TEST_ASSERT(cpuid1->nent == cpuid2->nent, + "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent); + + for (i = 0; i < cpuid1->nent; i++) { + e1 = &cpuid1->entries[i]; + e2 = &cpuid2->entries[i]; + + TEST_ASSERT(e1->function == e2->function && + e1->index == e2->index && e1->flags == e2->flags, + "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 
0x%x.%d.%x", + i, e1->function, e1->index, e1->flags, + e2->function, e2->index, e2->flags); + + if (is_cpuid_mangled(e1)) + continue; + + TEST_ASSERT(e1->eax == e2->eax && e1->ebx == e2->ebx && + e1->ecx == e2->ecx && e1->edx == e2->edx, + "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", + e1->function, e1->index, + e1->eax, e1->ebx, e1->ecx, e1->edx, + e2->eax, e2->ebx, e2->ecx, e2->edx); + } +} + +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } +} + +struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) +{ + int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); + struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); + + memcpy(guest_cpuids, cpuid, size); + + *p_gva = gva; + return guest_cpuids; +} + +static void set_cpuid_after_run(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *ent; + int rc; + u32 eax, ebx, x; + + /* Setting unmodified CPUID is allowed */ + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + + /* Changing CPU features is forbidden */ + ent = vcpu_get_cpuid_entry(vcpu, 0x7); + ebx = ent->ebx; + ent->ebx--; + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(rc, "Changing CPU features should fail"); + ent->ebx = ebx; + + /* Changing MAXPHYADDR is forbidden */ + ent = vcpu_get_cpuid_entry(vcpu, 0x80000008); + eax = ent->eax; + x = eax & 0xff; + ent->eax = (eax & ~0xffu) | (x - 1); + rc = __vcpu_set_cpuid(vcpu); + TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); + ent->eax = eax; +} + +static void test_get_cpuid2(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1); + int i, r; + + vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, + "KVM didn't update nent on success, wanted %u, got %u", + vcpu->cpuid->nent, cpuid->nent); + + for (i = 0; i < vcpu->cpuid->nent; i++) { + cpuid->nent = i; + r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r)); + TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure"); + } + free(cpuid); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t cpuid_gva; + struct kvm_vm *vm; + int stage; + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + compare_cpuids(kvm_get_supported_cpuid(), vcpu->cpuid); + + vcpu_alloc_cpuid(vm, &cpuid_gva, vcpu->cpuid); + + vcpu_args_set(vcpu, 1, cpuid_gva); + + for (stage = 0; stage < 3; stage++) + run_vcpu(vcpu, stage); + + set_cpuid_after_run(vcpu); + + test_get_cpuid2(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c index 140e91901582..624dc725e14d 100644 --- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c +++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c @@ -19,25 +19,11 @@ #include "kvm_util.h" #include "processor.h" -#define X86_FEATURE_XSAVE (1<<26) 
-#define X86_FEATURE_OSXSAVE (1<<27) -#define VCPU_ID 1 - static inline bool cr4_cpuid_is_sync(void) { - int func, subfunc; - uint32_t eax, ebx, ecx, edx; - uint64_t cr4; - - func = 0x1; - subfunc = 0x0; - __asm__ __volatile__("cpuid" - : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) - : "a"(func), "c"(subfunc)); + uint64_t cr4 = get_cr4(); - cr4 = get_cr4(); - - return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE)); + return (this_cpu_has(X86_FEATURE_OSXSAVE) == !!(cr4 & X86_CR4_OSXSAVE)); } static void guest_code(void) @@ -63,45 +49,28 @@ static void guest_code(void) int main(int argc, char *argv[]) { - struct kvm_run *run; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct kvm_sregs sregs; - struct kvm_cpuid_entry2 *entry; struct ucall uc; - int rc; - entry = kvm_get_supported_cpuid_entry(1); - if (!(entry->ecx & X86_FEATURE_XSAVE)) { - print_skip("XSAVE feature not supported"); - return 0; - } + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); while (1) { - rc = _vcpu_run(vm, VCPU_ID); - - TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_SYNC: /* emulate hypervisor clearing CR4.OSXSAVE */ - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.cr4 &= ~X86_CR4_OSXSAVE; - vcpu_sregs_set(vm, VCPU_ID, &sregs); + vcpu_sregs_set(vcpu, &sregs); break; case UCALL_ABORT: - TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit."); + REPORT_GUEST_ASSERT(uc); break; case UCALL_DONE: goto done; @@ -110,8 +79,7 @@ int main(int argc, char *argv[]) } } - kvm_vm_free(vm); - done: + kvm_vm_free(vm); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index b8d14f9db5f9..f6b295e0b2d2 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -8,12 +8,13 @@ #include <string.h> #include "kvm_util.h" #include "processor.h" - -#define VCPU_ID 0 +#include "apic.h" #define DR6_BD (1 << 13) #define DR7_GD (1 << 13) +#define IRQ_VECTOR 0xAA + /* For testing data access debug BP */ uint32_t guest_value; @@ -21,6 +22,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; static void guest_code(void) { + /* Create a pending interrupt on current vCPU */ + x2apic_enable(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | + APIC_DM_FIXED | IRQ_VECTOR); + /* * Software BP tests. 
* @@ -38,12 +44,19 @@ static void guest_code(void) "mov %%rax,%0;\n\t write_data:" : "=m" (guest_value) : : "rax"); - /* Single step test, covers 2 basic instructions and 2 emulated */ + /* + * Single step test, covers 2 basic instructions and 2 emulated + * + * Enable interrupts during the single stepping to see that + * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ + */ asm volatile("ss_start: " + "sti\n\t" "xor %%eax,%%eax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" + "cli\n\t" : : : "eax", "ebx", "ecx", "edx"); /* DR6.BD test */ @@ -51,64 +64,63 @@ static void guest_code(void) GUEST_DONE(); } -#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug)) -#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug) #define CAST_TO_RIP(v) ((unsigned long long)&(v)) -#define SET_RIP(v) do { \ - vcpu_regs_get(vm, VCPU_ID, ®s); \ - regs.rip = (v); \ - vcpu_regs_set(vm, VCPU_ID, ®s); \ - } while (0) -#define MOVE_RIP(v) SET_RIP(regs.rip + (v)); + +static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len) +{ + struct kvm_regs regs; + + vcpu_regs_get(vcpu, ®s); + regs.rip += insn_len; + vcpu_regs_set(vcpu, ®s); +} int main(void) { struct kvm_guest_debug debug; unsigned long long target_dr6, target_rip; - struct kvm_regs regs; + struct kvm_vcpu *vcpu; struct kvm_run *run; struct kvm_vm *vm; struct ucall uc; uint64_t cmd; int i; /* Instruction lengths starting at ss_start */ - int ss_size[4] = { - 3, /* xor */ + int ss_size[6] = { + 1, /* sti*/ + 2, /* xor */ 2, /* cpuid */ 5, /* mov */ 2, /* rdmsr */ + 1, /* cli */ }; - if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { - print_skip("KVM_CAP_SET_GUEST_DEBUG not supported"); - return 0; - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG)); - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; /* Test software BPs - int3 */ - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == BP_VECTOR && run->debug.arch.pc == CAST_TO_RIP(sw_bp), "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)", run->exit_reason, run->debug.arch.exception, run->debug.arch.pc, CAST_TO_RIP(sw_bp)); - MOVE_RIP(1); + vcpu_skip_insn(vcpu, 1); /* Test instruction HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp); debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1)); - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -121,17 +133,17 @@ int main(void) run->debug.arch.dr6, target_dr6); } /* Skip "nop" */ - MOVE_RIP(1); + vcpu_skip_insn(vcpu, 1); /* Test data access HW BP over DR[0-3] */ for (i = 0; i < 4; i++) { - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[i] = CAST_TO_RIP(guest_value); debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) | (0x000d0000UL << (4*i)); - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vcpu, 
&debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | (1UL << i); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -143,22 +155,22 @@ int main(void) run->debug.arch.pc, CAST_TO_RIP(write_data), run->debug.arch.dr6, target_dr6); /* Rollback the 4-bytes "mov" */ - MOVE_RIP(-7); + vcpu_skip_insn(vcpu, -7); } /* Skip the 4-bytes "mov" */ - MOVE_RIP(7); + vcpu_skip_insn(vcpu, 7); /* Test single step */ target_rip = CAST_TO_RIP(ss_start); target_dr6 = 0xffff4ff0ULL; - vcpu_regs_get(vm, VCPU_ID, ®s); for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; - CLEAR_DEBUG(); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + memset(&debug, 0, sizeof(debug)); + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | + KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && run->debug.arch.pc == target_rip && @@ -171,11 +183,11 @@ int main(void) } /* Finally test global disable */ - CLEAR_DEBUG(); + memset(&debug, 0, sizeof(debug)); debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; debug.arch.debugreg[7] = 0x400 | DR7_GD; - APPLY_DEBUG(); - vcpu_run(vm, VCPU_ID); + vcpu_guest_debug_set(vcpu, &debug); + vcpu_run(vcpu); target_dr6 = 0xffff0ff0 | DR6_BD; TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG && run->debug.arch.exception == DB_VECTOR && @@ -188,12 +200,12 @@ int main(void) target_dr6); /* Disable all debug controls, run to the end */ - CLEAR_DEBUG(); - APPLY_DEBUG(); + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO"); - cmd = get_ucall(vm, VCPU_ID, &uc); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + cmd = get_ucall(vcpu, &uc); TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE"); kvm_vm_free(vm); diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c new file mode 100644 index 000000000000..2929c067c207 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty logging page splitting test + * + * Based on dirty_log_perf.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2023, Google, Inc. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <linux/bitmap.h> + +#include "kvm_util.h" +#include "test_util.h" +#include "memstress.h" +#include "guest_modes.h" +#include "ucall_common.h" + +#define VCPUS 2 +#define SLOTS 2 +#define ITERATIONS 2 + +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB; + +static u64 dirty_log_manual_caps; +static bool host_quit; +static int iteration; +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +struct kvm_page_stats { + uint64_t pages_4k; + uint64_t pages_2m; + uint64_t pages_1g; + uint64_t hugepages; +}; + +static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage) +{ + stats->pages_4k = vm_get_stat(vm, "pages_4k"); + stats->pages_2m = vm_get_stat(vm, "pages_2m"); + stats->pages_1g = vm_get_stat(vm, "pages_1g"); + stats->hugepages = stats->pages_2m + stats->pages_1g; + + pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n", + stage, stats->pages_4k, stats->pages_2m, stats->pages_1g, + stats->hugepages); +} + +static void run_vcpu_iteration(struct kvm_vm *vm) +{ + int i; + + iteration++; + for (i = 0; i < VCPUS; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != + iteration) + ; + } +} + +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + int vcpu_idx = vcpu_args->vcpu_idx; + + while (!READ_ONCE(host_quit)) { + int current_iteration = READ_ONCE(iteration); + + vcpu_run(vcpu); + + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; + + /* Wait for the start of the next iteration to be signaled. 
*/ + while (current_iteration == READ_ONCE(iteration) && + READ_ONCE(iteration) >= 0 && + !READ_ONCE(host_quit)) + ; + } +} + +static void run_test(enum vm_guest_mode mode, void *unused) +{ + struct kvm_vm *vm; + unsigned long **bitmaps; + uint64_t guest_num_pages; + uint64_t host_num_pages; + uint64_t pages_per_slot; + int i; + struct kvm_page_stats stats_populated; + struct kvm_page_stats stats_dirty_logging_enabled; + struct kvm_page_stats stats_dirty_pass[ITERATIONS]; + struct kvm_page_stats stats_clear_pass[ITERATIONS]; + struct kvm_page_stats stats_dirty_logging_disabled; + struct kvm_page_stats stats_repopulated; + + vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size, + SLOTS, backing_src, false); + + guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + pages_per_slot = host_num_pages / SLOTS; + TEST_ASSERT_EQ(host_num_pages, pages_per_slot * SLOTS); + TEST_ASSERT(!(host_num_pages % 512), + "Number of pages, '%lu' not a multiple of 2MiB", host_num_pages); + + bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot); + + if (dirty_log_manual_caps) + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, + dirty_log_manual_caps); + + /* Start the iterations */ + iteration = -1; + host_quit = false; + + for (i = 0; i < VCPUS; i++) + vcpu_last_completed_iteration[i] = -1; + + memstress_start_vcpu_threads(VCPUS, vcpu_worker); + + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_populated, "populating memory"); + + /* Enable dirty logging */ + memstress_enable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging"); + + while (iteration < ITERATIONS) { + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_dirty_pass[iteration - 1], + "dirtying memory"); + + memstress_get_dirty_log(vm, bitmaps, SLOTS); + + if (dirty_log_manual_caps) { + memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot); + + get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log"); + } + } + + /* Disable dirty logging */ + memstress_disable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging"); + + /* Run vCPUs again to fault pages back in. */ + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_repopulated, "repopulating memory"); + + /* + * Tell the vCPU threads to quit. No need to manually check that vCPUs + * have stopped running after disabling dirty logging, the join will + * wait for them to exit. + */ + host_quit = true; + memstress_join_vcpu_threads(VCPUS); + + memstress_free_bitmaps(bitmaps, SLOTS); + memstress_destroy_vm(vm); + + TEST_ASSERT_EQ((stats_populated.pages_2m * 512 + + stats_populated.pages_1g * 512 * 512), host_num_pages); + + /* + * Check that all huge pages were split. Since large pages can only + * exist in the data slot, and the vCPUs should have dirtied all pages + * in the data slot, there should be no huge pages left after splitting. + * Splitting happens at dirty log enable time without + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass + * with that capability. 
+ */ + if (dirty_log_manual_caps) { + TEST_ASSERT_EQ(stats_clear_pass[0].hugepages, 0); + TEST_ASSERT(stats_clear_pass[0].pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_clear_pass[0].pages_4k); + TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages); + } else { + TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0); + TEST_ASSERT(stats_dirty_logging_enabled.pages_4k >= host_num_pages, + "Expected at least '%lu' 4KiB pages, found only '%lu'", + host_num_pages, stats_dirty_logging_enabled.pages_4k); + } + + /* + * Once dirty logging is disabled and the vCPUs have touched all their + * memory again, the hugepage counts should be the same as they were + * right after initial population of memory. + */ + TEST_ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m); + TEST_ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n", + name); + puts(""); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + backing_src_help("-s"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + int opt; + + TEST_REQUIRE(get_kvm_param_bool("eager_page_split")); + TEST_REQUIRE(get_kvm_param_bool("tdp_mmu")); + + while ((opt = getopt(argc, argv, "b:hs:")) != -1) { + switch (opt) { + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'h': + help(argv[0]); + exit(0); + case 's': + backing_src = parse_backing_src_type(optarg); + break; + default: + help(argv[0]); + exit(1); + } + } + + if (!is_backing_src_hugetlb(backing_src)) { + pr_info("This test will only work reliably with HugeTLB memory. " + "It can work with THP, but that is best effort.\n"); + } + + guest_modes_append_default(); + + dirty_log_manual_caps = 0; + for_each_guest_mode(run_test, NULL); + + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + + if (dirty_log_manual_caps) { + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + for_each_guest_mode(run_test, NULL); + } else { + pr_info("Skipping testing with MANUAL_PROTECT as it is not supported"); + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c deleted file mode 100644 index 757928199f19..000000000000 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018, Red Hat, Inc. - * - * Tests for Enlightened VMCS, including nested guest state. - */ -#define _GNU_SOURCE /* for program_invocation_short_name */ -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/ioctl.h> - -#include "test_util.h" - -#include "kvm_util.h" - -#include "vmx.h" - -#define VCPU_ID 5 - -void l2_guest_code(void) -{ - GUEST_SYNC(7); - - GUEST_SYNC(8); - - /* Done, exit to L1 and never come back. 
*/ - vmcall(); -} - -void l1_guest_code(struct vmx_pages *vmx_pages) -{ -#define L2_GUEST_STACK_SIZE 64 - unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - - enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist); - - GUEST_ASSERT(vmx_pages->vmcs_gpa); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_SYNC(3); - GUEST_ASSERT(load_vmcs(vmx_pages)); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - - GUEST_SYNC(4); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - - GUEST_SYNC(5); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - current_evmcs->revision_id = -1u; - GUEST_ASSERT(vmlaunch()); - current_evmcs->revision_id = EVMCS_VERSION; - GUEST_SYNC(6); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa); - GUEST_SYNC(9); - GUEST_ASSERT(!vmresume()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); - GUEST_SYNC(10); -} - -void guest_code(struct vmx_pages *vmx_pages) -{ - GUEST_SYNC(1); - GUEST_SYNC(2); - - if (vmx_pages) - l1_guest_code(vmx_pages); - - GUEST_DONE(); - - /* Try enlightened vmptrld with an incorrect GPA */ - evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs); - GUEST_ASSERT(vmlaunch()); -} - -int main(int argc, char *argv[]) -{ - vm_vaddr_t vmx_pages_gva = 0; - - struct kvm_regs regs1, regs2; - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_x86_state *state; - struct ucall uc; - int stage; - - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - - if (!nested_vmx_supported() || - !kvm_check_cap(KVM_CAP_NESTED_STATE) || - !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - print_skip("Enlightened VMCS is unsupported"); - exit(KSFT_SKIP); - } - - vcpu_enable_evmcs(vm, VCPU_ID); - - run = vcpu_state(vm, VCPU_ID); - - vcpu_regs_get(vm, VCPU_ID, ®s1); - - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - - for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Stage %d: unexpected exit reason: %u (%s),\n", - stage, run->exit_reason, - exit_reason_str(run->exit_reason)); - - switch (get_ucall(vm, VCPU_ID, &uc)) { - case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); - /* NOT REACHED */ - case UCALL_SYNC: - break; - case UCALL_DONE: - goto part1_done; - default: - TEST_FAIL("Unknown ucall %lu", uc.cmd); - } - - /* UCALL_SYNC is handled here. */ - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", - stage, (ulong)uc.args[1]); - - state = vcpu_save_state(vm, VCPU_ID); - memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); - - kvm_vm_release(vm); - - /* Restore state in a new VM. 
*/ - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_enable_evmcs(vm, VCPU_ID); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); - - memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); - TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), - "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", - (ulong) regs2.rdi, (ulong) regs2.rsi); - } - -part1_done: - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Unexpected successful VMEnter with invalid eVMCS pointer!"); - - kvm_vm_free(vm); -} diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c new file mode 100644 index 000000000000..81055476d394 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + * + * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE. + */ +#include "flds_emulation.h" +#include "test_util.h" +#include "ucall_common.h" + +#define MMIO_GPA 0x700000000 +#define MMIO_GVA MMIO_GPA + +static void guest_code(void) +{ + /* Execute flds with an MMIO address to force KVM to emulate it. */ + flds(MMIO_GVA); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); + virt_map(vm, MMIO_GVA, MMIO_GPA, 1); + + vcpu_run(vcpu); + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c new file mode 100644 index 000000000000..762628f7d4ba --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <linux/stringify.h> +#include <stdint.h> + +#include "kvm_test_harness.h" +#include "apic.h" +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* VMCALL and VMMCALL are both 3-byte opcodes. 
*/ +#define HYPERCALL_INSN_SIZE 3 + +static bool quirk_disabled; + +static void guest_ud_handler(struct ex_regs *regs) +{ + regs->rax = -EFAULT; + regs->rip += HYPERCALL_INSN_SIZE; +} + +static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 }; +static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 }; + +extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE]; +static uint64_t do_sched_yield(uint8_t apic_id) +{ + uint64_t ret; + + asm volatile("hypercall_insn:\n\t" + ".byte 0xcc,0xcc,0xcc\n\t" + : "=a"(ret) + : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id) + : "memory"); + + return ret; +} + +static void guest_main(void) +{ + const uint8_t *native_hypercall_insn; + const uint8_t *other_hypercall_insn; + uint64_t ret; + + if (host_cpu_is_intel) { + native_hypercall_insn = vmx_vmcall; + other_hypercall_insn = svm_vmmcall; + } else if (host_cpu_is_amd) { + native_hypercall_insn = svm_vmmcall; + other_hypercall_insn = vmx_vmcall; + } else { + GUEST_ASSERT(0); + /* unreachable */ + return; + } + + memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE); + + ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID))); + + /* + * If the quirk is disabled, verify that guest_ud_handler() "returned" + * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is + * enabled, verify that the hypercall succeeded and that KVM patched in + * the "right" hypercall. + */ + if (quirk_disabled) { + GUEST_ASSERT(ret == (uint64_t)-EFAULT); + GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } else { + GUEST_ASSERT(!ret); + GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, + HYPERCALL_INSN_SIZE)); + } + + GUEST_DONE(); +} + +KVM_ONE_VCPU_TEST_SUITE(fix_hypercall); + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]); + break; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)", + uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason)); + } +} + +static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk) +{ + struct kvm_vm *vm = vcpu->vm; + + vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler); + + if (disable_quirk) + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, + KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + + quirk_disabled = disable_quirk; + sync_global_to_guest(vm, quirk_disabled); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + enter_guest(vcpu); +} + +KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main) +{ + test_fix_hypercall(vcpu, false); +} + +KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main) +{ + test_fix_hypercall(vcpu, true); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h new file mode 100644 index 000000000000..37b1a9f52864 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/flds_emulation.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_FLDS_EMULATION_H +#define SELFTEST_KVM_FLDS_EMULATION_H + +#include "kvm_util.h" + +#define FLDS_MEM_EAX ".byte 0xd9, 0x00" + +/* + * 
flds is an instruction that the KVM instruction emulator is known not to + * support. This can be used in guest code along with a mechanism to force + * KVM to emulate the instruction (e.g. by providing an MMIO address) to + * exercise emulation failures. + */ +static inline void flds(uint64_t address) +{ + __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address)); +} + +static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + uint8_t *insn_bytes; + uint64_t flags; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Unexpected suberror: %u", + run->emulation_failure.suberror); + + flags = run->emulation_failure.flags; + TEST_ASSERT(run->emulation_failure.ndata >= 3 && + flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES, + "run->emulation_failure is missing instruction bytes"); + + TEST_ASSERT(run->emulation_failure.insn_size >= 2, + "Expected a 2-byte opcode for 'flds', got %d bytes", + run->emulation_failure.insn_size); + + insn_bytes = run->emulation_failure.insn_bytes; + TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0, + "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x", + insn_bytes[0], insn_bytes[1]); + + vcpu_regs_get(vcpu, ®s); + regs.rip += 2; + vcpu_regs_set(vcpu, ®s); +} + +#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */ diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c new file mode 100644 index 000000000000..d09b3cbcadc6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_GET_MSR_INDEX_LIST and + * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +int main(int argc, char *argv[]) +{ + const struct kvm_msr_list *feature_list; + int i; + + /* + * Skip the entire test if MSR_FEATURES isn't supported, other tests + * will cover the "regular" list of MSRs, the coverage here is purely + * opportunistic and not interesting on its own. + */ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES)); + + (void)kvm_get_msr_index_list(); + + feature_list = kvm_get_feature_msr_index_list(); + for (i = 0; i < feature_list->nmsrs; i++) + kvm_get_feature_msr(feature_list->indices[i]); +} diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c new file mode 100644 index 000000000000..10b1b0ba374e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023, Google LLC. + */ +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit) +{ + const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8); + const uint64_t valid = BIT_ULL(18) | BIT_ULL(24); + const uint64_t legal = ignored | valid; + uint64_t val = BIT_ULL(bit); + uint64_t actual; + int r; + + r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val); + TEST_ASSERT(val & ~legal ? !r : r == 1, + "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s", + val, val & ~legal ? 
"fail" : "succeed"); + + actual = vcpu_get_msr(vcpu, MSR_K7_HWCR); + TEST_ASSERT(actual == (val & valid), + "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx", + bit, actual, (val & valid)); + + vcpu_set_msr(vcpu, MSR_K7_HWCR, 0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + unsigned int bit; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + for (bit = 0; bit < BITS_PER_LONG; bit++) + test_hwcr_bit(vcpu, bit); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c new file mode 100644 index 000000000000..e058bc676cd6 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. + * + * Tests for Hyper-V clocksources + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +/* Simplified mul_u64_u64_shr() */ +static inline u64 mul_u64_u64_shr64(u64 a, u64 b) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rm.ll = (u64)a0.l.low * b0.l.high; + rn.ll = (u64)a0.l.high * b0.l.low; + rh.ll = (u64)a0.l.high * b0.l.high; + + rh.l.low = c = rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + return rh.ll; +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 100000000; i++) + asm volatile("nop"); +} + +static inline void check_tsc_msr_rdtsc(void) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); + GUEST_ASSERT(tsc_freq > 0); + + /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */ + r1 = rdtsc(); + t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; + nop_loop(); + r2 = rdtsc(); + t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; + + GUEST_ASSERT(r2 > r1 && t2 > t1); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100); +} + +static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page) +{ + return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset; +} + +static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page) +{ + u64 r1, r2, t1, t2; + + /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */ + t1 = get_tscpage_ts(tsc_page); + r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + /* 10 ms tolerance */ + GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000); + nop_loop(); + + t2 = get_tscpage_ts(tsc_page); + r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000); +} + +static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa) +{ + u64 tsc_scale, tsc_offset; + + /* Set Guest OS id to enable Hyper-V emulation */ + GUEST_SYNC(1); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + GUEST_SYNC(2); + + check_tsc_msr_rdtsc(); + + GUEST_SYNC(3); + + /* Set up TSC page is disabled state, check that it's clean */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + + GUEST_SYNC(4); + + 
/* Set up TSC page is enabled state */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1); + GUEST_ASSERT(tsc_page->tsc_sequence != 0); + + GUEST_SYNC(5); + + check_tsc_msr_tsc_page(tsc_page); + + GUEST_SYNC(6); + + tsc_offset = tsc_page->tsc_offset; + /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */ + + GUEST_SYNC(7); + /* Sanity check TSC page timestamp, it should be close to 0 */ + GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000); + + GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset); + + nop_loop(); + + /* + * Enable Re-enlightenment and check that TSC page stays constant across + * KVM_SET_CLOCK. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); + tsc_offset = tsc_page->tsc_offset; + tsc_scale = tsc_page->tsc_scale; + GUEST_SYNC(8); + GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); + GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); + + GUEST_SYNC(9); + + check_tsc_msr_tsc_page(tsc_page); + + /* + * Disable re-enlightenment and TSC page, check that KVM doesn't update + * it anymore. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); + wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); + memset(tsc_page, 0, sizeof(*tsc_page)); + + GUEST_SYNC(10); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + + GUEST_DONE(); +} + +static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = vcpu_get_msr(vcpu, HV_X64_MSR_TSC_FREQUENCY); + TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); + + /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ + r1 = rdtsc(); + t1 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; + nop_loop(); + r2 = rdtsc(); + t2 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; + + TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, + "Elapsed time does not match (MSR=%ld, TSC=%ld)", + (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + vm_vaddr_t tsc_page_gva; + int stage; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + vcpu_set_hv_cpuid(vcpu); + + tsc_page_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize()); + TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, + "TSC page has to be page aligned"); + vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + + host_check_tsc_msr_rdtsc(vcpu); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + /* Keep in sync with guest_main() */ + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d", + stage); + goto out; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, + 
"Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + /* Reset kvmclock triggering TSC page update */ + if (stage == 7 || stage == 8 || stage == 10) { + struct kvm_clock_data clock = {0}; + + vm_ioctl(vm, KVM_SET_CLOCK, &clock); + } + } + +out: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c index 745b708c2d3b..4f5881d4ef66 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c @@ -7,8 +7,6 @@ * This work is licensed under the terms of the GNU GPL, version 2. * */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -20,8 +18,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 0 - static void guest_code(void) { } @@ -45,32 +41,26 @@ static bool smt_possible(void) return res; } -static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, - bool evmcs_enabled) +static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, + bool evmcs_expected) { int i; - int nent = 9; + int nent_expected = 10; u32 test_val; - if (evmcs_enabled) - nent += 1; /* 0x4000000A */ - - TEST_ASSERT(hv_cpuid_entries->nent == nent, + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" - " with evmcs=%d (returned %d)", - nent, evmcs_enabled, hv_cpuid_entries->nent); + " (returned %d)", + nent_expected, hv_cpuid_entries->nent); for (i = 0; i < hv_cpuid_entries->nent; i++) { - struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; + const struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i]; TEST_ASSERT((entry->function >= 0x40000000) && (entry->function <= 0x40000082), "function %x is our of supported range", entry->function); - TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A), - "0x4000000A leaf should not be reported"); - TEST_ASSERT(entry->index == 0, ".index field should be zero"); @@ -87,7 +77,7 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, TEST_ASSERT(entry->eax == test_val, "Wrong max leaf report in 0x40000000.EAX: %x" " (evmcs=%d)", - entry->eax, evmcs_enabled + entry->eax, evmcs_expected ); break; case 0x40000004: @@ -97,8 +87,20 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); break; - } + case 0x4000000A: + TEST_ASSERT(entry->eax & (1UL << 19), + "Enlightened MSR-Bitmap should always be supported" + " 0x40000000.EAX: %x", entry->eax); + if (evmcs_expected) + TEST_ASSERT((entry->eax & 0xffff) == 0x101, + "Supported Enlightened VMCS version range is supposed to be 1:1" + " 0x40000000.EAX: %x", entry->eax); + + break; + default: + break; + } /* * If needed for debug: * fprintf(stdout, @@ -107,84 +109,64 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } - } -void test_hv_cpuid_e2big(struct kvm_vm *vm) +void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 cpuid = {.nent = 0}; int ret; - ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + if (vcpu) + ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); + else + ret = __kvm_ioctl(vm->kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, &cpuid); TEST_ASSERT(ret == -1 && errno == E2BIG, - "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when" - " it should have: %d %d", ret, errno); + "%s KVM_GET_SUPPORTED_HV_CPUID 
didn't fail with -E2BIG when" + " it should have: %d %d", !vcpu ? "KVM" : "vCPU", ret, errno); } - -struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm) +int main(int argc, char *argv[]) { - int nent = 20; /* should be enough */ - static struct kvm_cpuid2 *cpuid; - - cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2)); - - if (!cpuid) { - perror("malloc"); - abort(); - } - - cpuid->nent = nent; + struct kvm_vm *vm; + const struct kvm_cpuid2 *hv_cpuid_entries; + struct kvm_vcpu *vcpu; - vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - return cpuid; -} + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + /* Test vCPU ioctl version */ + test_hv_cpuid_e2big(vm, vcpu); -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - int rv, stage; - struct kvm_cpuid2 *hv_cpuid_entries; - bool evmcs_enabled; + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + test_hv_cpuid(hv_cpuid_entries, false); + free((void *)hv_cpuid_entries); - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID); - if (!rv) { - print_skip("KVM_CAP_HYPERV_CPUID not supported"); - exit(KSFT_SKIP); + if (!kvm_cpu_has(X86_FEATURE_VMX) || + !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { + print_skip("Enlightened VMCS is unsupported"); + goto do_sys; + } + vcpu_enable_evmcs(vcpu); + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + test_hv_cpuid(hv_cpuid_entries, true); + free((void *)hv_cpuid_entries); + +do_sys: + /* Test system ioctl version */ + if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) { + print_skip("KVM_CAP_SYS_HYPERV_CPUID not supported"); + goto out; } - for (stage = 0; stage < 3; stage++) { - evmcs_enabled = false; + test_hv_cpuid_e2big(vm, NULL); - vm = vm_create_default(VCPU_ID, 0, guest_code); - switch (stage) { - case 0: - test_hv_cpuid_e2big(vm); - continue; - case 1: - break; - case 2: - if (!nested_vmx_supported() || - !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { - print_skip("Enlightened VMCS is unsupported"); - continue; - } - vcpu_enable_evmcs(vm, VCPU_ID); - evmcs_enabled = true; - break; - } + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); + test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); - hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm); - test_hv_cpuid(hv_cpuid_entries, evmcs_enabled); - free(hv_cpuid_entries); - kvm_vm_free(vm); - } +out: + kvm_vm_free(vm); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c new file mode 100644 index 000000000000..e192720bfe14 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018, Red Hat, Inc. + * + * Tests for Enlightened VMCS, including nested guest state. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/bitmap.h> + +#include "test_util.h" + +#include "kvm_util.h" + +#include "hyperv.h" +#include "vmx.h" + +static int ud_count; + +static void guest_ud_handler(struct ex_regs *regs) +{ + ud_count++; + regs->rip += 3; /* VMLAUNCH */ +} + +static void guest_nmi_handler(struct ex_regs *regs) +{ +} + +static inline void rdmsr_from_l2(uint32_t msr) +{ + /* Currently, L1 doesn't preserve GPRs during vmexits. 
*/ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +/* Exit to L1 from L2 with RDMSR instruction */ +void l2_guest_code(void) +{ + u64 unused; + + GUEST_SYNC(7); + + GUEST_SYNC(8); + + /* Forced exit to L1 upon restore */ + GUEST_SYNC(9); + + vmcall(); + + /* MSR-Bitmap tests */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ + vmcall(); + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ + + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS, + &unused); + + /* Done, exit to L1 and never come back. */ + vmcall(); +} + +void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages, + vm_vaddr_t hv_hcall_page_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa); + + x2apic_enable(); + + GUEST_SYNC(1); + GUEST_SYNC(2); + + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); + evmcs_enable(); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_SYNC(3); + GUEST_ASSERT(load_evmcs(hv_pages)); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + GUEST_SYNC(4); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_SYNC(5); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + current_evmcs->revision_id = -1u; + GUEST_ASSERT(vmlaunch()); + current_evmcs->revision_id = EVMCS_VERSION; + GUEST_SYNC(6); + + vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) | + PIN_BASED_NMI_EXITING); + + /* L2 TLB flush setup */ + current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa; + current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + current_evmcs->hv_vm_id = 1; + current_evmcs->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR); + GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa); + + /* + * NMI forces L2->L1 exit, resuming L2 and hope that EVMCS is + * up-to-date (RIP points where it should and not at the beginning + * of l2_guest_code(). GUEST_SYNC(9) checkes that. 
+ */ + GUEST_ASSERT(!vmresume()); + + GUEST_SYNC(10); + + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Intercept RDMSR 0xc0000100 */ + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_USE_MSR_BITMAPS); + __set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + current_evmcs->hv_enlightenments_control.msr_bitmap = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + __set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + current_evmcs->guest_rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. + */ + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ); + current_evmcs->guest_rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH); + + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_SYNC(11); + + /* Try enlightened vmptrld with an incorrect GPA */ + evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs); + GUEST_ASSERT(vmlaunch()); + GUEST_ASSERT(ud_count == 1); + GUEST_DONE(); +} + +void inject_nmi(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + events.nmi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; + + vcpu_events_set(vcpu, &events); +} + +static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm, + struct kvm_vcpu *vcpu) +{ + struct kvm_regs regs1, regs2; + struct kvm_x86_state *state; + + state = vcpu_save_state(vcpu); + memset(®s1, 0, sizeof(regs1)); + vcpu_regs_get(vcpu, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. 
*/ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + + memset(®s2, 0, sizeof(regs2)); + vcpu_regs_get(vcpu, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); + return vcpu; +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + + vcpu_set_hv_cpuid(vcpu); + vcpu_enable_evmcs(vcpu); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); + + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); + + pr_info("Running L1 which uses EVMCS to run L2\n"); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. */ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + vcpu = save_restore_vm(vm, vcpu); + + /* Force immediate L2->L1 exit before resuming */ + if (stage == 8) { + pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n"); + inject_nmi(vcpu); + } + + /* + * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly + * restored VM (before the first KVM_RUN) to check that + * KVM_STATE_NESTED_EVMCS is not lost. + */ + if (stage == 9) { + pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n"); + vcpu = save_restore_vm(vm, vcpu); + } + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c new file mode 100644 index 000000000000..949e08e98f31 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES (0x8001), + * exit to userspace and receive result in guest. 
+ * + * Negative tests are present in hyperv_features.c + * + * Copyright 2022 Google LLC + * Author: Vipin Sharma <vipinsh@google.com> + */ +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +/* Any value is fine */ +#define EXT_CAPABILITIES 0xbull + +static void guest_code(vm_paddr_t in_pg_gpa, vm_paddr_t out_pg_gpa, + vm_vaddr_t out_pg_gva) +{ + uint64_t *output_gva; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, in_pg_gpa); + + output_gva = (uint64_t *)out_pg_gva; + + hyperv_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, in_pg_gpa, out_pg_gpa); + + /* TLFS states output will be a uint64_t value */ + GUEST_ASSERT_EQ(*output_gva, EXT_CAPABILITIES); + + GUEST_DONE(); +} + +int main(void) +{ + vm_vaddr_t hcall_out_page; + vm_vaddr_t hcall_in_page; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + uint64_t *outval; + struct ucall uc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); + + /* Verify if extended hypercalls are supported */ + if (!kvm_cpuid_has(kvm_get_supported_hv_cpuid(), + HV_ENABLE_EXTENDED_HYPERCALLS)) { + print_skip("Extended calls not supported by the kernel"); + exit(KSFT_SKIP); + } + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + vcpu_set_hv_cpuid(vcpu); + + /* Hypercall input */ + hcall_in_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_in_page), 0x0, vm->page_size); + + /* Hypercall output */ + hcall_out_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_out_page), 0x0, vm->page_size); + + vcpu_args_set(vcpu, 3, addr_gva2gpa(vm, hcall_in_page), + addr_gva2gpa(vm, hcall_out_page), hcall_out_page); + + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERV, + "Unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + outval = addr_gpa2hva(vm, run->hyperv.u.hcall.params[1]); + *outval = EXT_CAPABILITIES; + run->hyperv.u.hcall.result = HV_STATUS_SUCCESS; + + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + } + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c new file mode 100644 index 000000000000..068e9c69710d --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. + * + * Tests for Hyper-V features enablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" + +/* + * HYPERV_CPUID_ENLIGHTMENT_INFO.EBX is not a 'feature' CPUID leaf + * but to activate the feature it is sufficient to set it to a non-zero + * value. Use BIT(0) for that. 
+ */ +#define HV_PV_SPINLOCKS_TEST \ + KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EBX, 0) + +struct msr_data { + uint32_t idx; + bool fault_expected; + bool write; + u64 write_val; +}; + +struct hcall_data { + uint64_t control; + uint64_t expect; + bool ud_expected; +}; + +static bool is_write_only_msr(uint32_t msr) +{ + return msr == HV_X64_MSR_EOI; +} + +static void guest_msr(struct msr_data *msr) +{ + uint8_t vector = 0; + uint64_t msr_val = 0; + + GUEST_ASSERT(msr->idx); + + if (msr->write) + vector = wrmsr_safe(msr->idx, msr->write_val); + + if (!vector && (!msr->write || !is_write_only_msr(msr->idx))) + vector = rdmsr_safe(msr->idx, &msr_val); + + if (msr->fault_expected) + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP on %sMSR(0x%x), got vector '0x%x'", + msr->write ? "WR" : "RD", msr->idx, vector); + else + __GUEST_ASSERT(!vector, + "Expected success on %sMSR(0x%x), got vector '0x%x'", + msr->write ? "WR" : "RD", msr->idx, vector); + + if (vector || is_write_only_msr(msr->idx)) + goto done; + + if (msr->write) + __GUEST_ASSERT(!vector, + "WRMSR(0x%x) to '0x%lx', RDMSR read '0x%lx'", + msr->idx, msr->write_val, msr_val); + + /* Invariant TSC bit appears when TSC invariant control MSR is written to */ + if (msr->idx == HV_X64_MSR_TSC_INVARIANT_CONTROL) { + if (!this_cpu_has(HV_ACCESS_TSC_INVARIANT)) + GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC)); + else + GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC) == + !!(msr_val & HV_INVARIANT_TSC_EXPOSED)); + } + +done: + GUEST_DONE(); +} + +static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) +{ + u64 res, input, output; + uint8_t vector; + + GUEST_ASSERT_NE(hcall->control, 0); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + + if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { + input = pgs_gpa; + output = pgs_gpa + 4096; + } else { + input = output = 0; + } + + vector = __hyperv_hypercall(hcall->control, input, output, &res); + if (hcall->ud_expected) { + __GUEST_ASSERT(vector == UD_VECTOR, + "Expected #UD for control '%lu', got vector '0x%x'", + hcall->control, vector); + } else { + __GUEST_ASSERT(!vector, + "Expected no exception for control '%lu', got vector '0x%x'", + hcall->control, vector); + GUEST_ASSERT_EQ(res, hcall->expect); + } + + GUEST_DONE(); +} + +static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu) +{ + /* + * Enable all supported Hyper-V features, then clear the leafs holding + * the features that will be tested one by one. + */ + vcpu_set_hv_cpuid(vcpu); + + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); + vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); +} + +static void guest_test_msrs_access(void) +{ + struct kvm_cpuid2 *prev_cpuid = NULL; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage = 0; + vm_vaddr_t msr_gva; + struct msr_data *msr; + bool has_invtsc = kvm_cpu_has(X86_FEATURE_INVTSC); + + while (true) { + vm = vm_create_with_one_vcpu(&vcpu, guest_msr); + + msr_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); + msr = addr_gva2hva(vm, msr_gva); + + vcpu_args_set(vcpu, 1, msr_gva); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); + + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + /* TODO: Make this entire test easier to maintain. 
*/ + if (stage >= 21) + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0); + + switch (stage) { + case 0: + /* + * Only available when Hyper-V identification is set + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = false; + msr->fault_expected = true; + break; + case 1: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = false; + msr->fault_expected = true; + break; + case 2: + vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); + /* + * HV_X64_MSR_GUEST_OS_ID has to be written first to make + * HV_X64_MSR_HYPERCALL available. + */ + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = true; + msr->write_val = HYPERV_LINUX_OS_ID; + msr->fault_expected = false; + break; + case 3: + msr->idx = HV_X64_MSR_GUEST_OS_ID; + msr->write = false; + msr->fault_expected = false; + break; + case 4: + msr->idx = HV_X64_MSR_HYPERCALL; + msr->write = false; + msr->fault_expected = false; + break; + + case 5: + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = false; + msr->fault_expected = true; + break; + case 6: + vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_RUNTIME_AVAILABLE); + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = false; + msr->fault_expected = false; + break; + case 7: + /* Read only */ + msr->idx = HV_X64_MSR_VP_RUNTIME; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 8: + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = false; + msr->fault_expected = true; + break; + case 9: + vcpu_set_cpuid_feature(vcpu, HV_MSR_TIME_REF_COUNT_AVAILABLE); + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = false; + msr->fault_expected = false; + break; + case 10: + /* Read only */ + msr->idx = HV_X64_MSR_TIME_REF_COUNT; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 11: + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = false; + msr->fault_expected = true; + break; + case 12: + vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_INDEX_AVAILABLE); + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = false; + msr->fault_expected = false; + break; + case 13: + /* Read only */ + msr->idx = HV_X64_MSR_VP_INDEX; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 14: + msr->idx = HV_X64_MSR_RESET; + msr->write = false; + msr->fault_expected = true; + break; + case 15: + vcpu_set_cpuid_feature(vcpu, HV_MSR_RESET_AVAILABLE); + msr->idx = HV_X64_MSR_RESET; + msr->write = false; + msr->fault_expected = false; + break; + case 16: + msr->idx = HV_X64_MSR_RESET; + msr->write = true; + /* + * TODO: the test only writes '0' to HV_X64_MSR_RESET + * at the moment, writing some other value there will + * trigger real vCPU reset and the code is not prepared + * to handle it yet. + */ + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 17: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = false; + msr->fault_expected = true; + break; + case 18: + vcpu_set_cpuid_feature(vcpu, HV_MSR_REFERENCE_TSC_AVAILABLE); + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = false; + msr->fault_expected = false; + break; + case 19: + msr->idx = HV_X64_MSR_REFERENCE_TSC; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 20: + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = true; + break; + case 21: + /* + * Remains unavailable even with KVM_CAP_HYPERV_SYNIC2 + * capability enabled and guest visible CPUID bit unset. 
+ */ + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = true; + break; + case 22: + vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNIC_AVAILABLE); + msr->idx = HV_X64_MSR_EOM; + msr->write = false; + msr->fault_expected = false; + break; + case 23: + msr->idx = HV_X64_MSR_EOM; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 24: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = false; + msr->fault_expected = true; + break; + case 25: + vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNTIMER_AVAILABLE); + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = false; + msr->fault_expected = false; + break; + case 26: + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + case 27: + /* Direct mode test */ + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 1 << 12; + msr->fault_expected = true; + break; + case 28: + vcpu_set_cpuid_feature(vcpu, HV_STIMER_DIRECT_MODE_AVAILABLE); + msr->idx = HV_X64_MSR_STIMER0_CONFIG; + msr->write = true; + msr->write_val = 1 << 12; + msr->fault_expected = false; + break; + + case 29: + msr->idx = HV_X64_MSR_EOI; + msr->write = false; + msr->fault_expected = true; + break; + case 30: + vcpu_set_cpuid_feature(vcpu, HV_MSR_APIC_ACCESS_AVAILABLE); + msr->idx = HV_X64_MSR_EOI; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + + case 31: + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = false; + msr->fault_expected = true; + break; + case 32: + vcpu_set_cpuid_feature(vcpu, HV_ACCESS_FREQUENCY_MSRS); + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = false; + msr->fault_expected = false; + break; + case 33: + /* Read only */ + msr->idx = HV_X64_MSR_TSC_FREQUENCY; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 34: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = false; + msr->fault_expected = true; + break; + case 35: + vcpu_set_cpuid_feature(vcpu, HV_ACCESS_REENLIGHTENMENT); + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = false; + msr->fault_expected = false; + break; + case 36: + msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + case 37: + /* Can only write '0' */ + msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = true; + break; + + case 38: + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = false; + msr->fault_expected = true; + break; + case 39: + vcpu_set_cpuid_feature(vcpu, HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE); + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = false; + msr->fault_expected = false; + break; + case 40: + msr->idx = HV_X64_MSR_CRASH_P0; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + + case 41: + msr->idx = HV_X64_MSR_SYNDBG_STATUS; + msr->write = false; + msr->fault_expected = true; + break; + case 42: + vcpu_set_cpuid_feature(vcpu, HV_FEATURE_DEBUG_MSRS_AVAILABLE); + vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING); + msr->idx = HV_X64_MSR_SYNDBG_STATUS; + msr->write = false; + msr->fault_expected = false; + break; + case 43: + msr->idx = HV_X64_MSR_SYNDBG_STATUS; + msr->write = true; + msr->write_val = 0; + msr->fault_expected = false; + break; + + case 44: + /* MSR is not available when CPUID feature bit is unset */ + if (!has_invtsc) + goto next_stage; + msr->idx = 
HV_X64_MSR_TSC_INVARIANT_CONTROL; + msr->write = false; + msr->fault_expected = true; + break; + case 45: + /* MSR is vailable when CPUID feature bit is set */ + if (!has_invtsc) + goto next_stage; + vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT); + msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; + msr->write = false; + msr->fault_expected = false; + break; + case 46: + /* Writing bits other than 0 is forbidden */ + if (!has_invtsc) + goto next_stage; + msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; + msr->write = true; + msr->write_val = 0xdeadbeef; + msr->fault_expected = true; + break; + case 47: + /* Setting bit 0 enables the feature */ + if (!has_invtsc) + goto next_stage; + msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL; + msr->write = true; + msr->write_val = 1; + msr->fault_expected = false; + break; + + default: + kvm_vm_free(vm); + return; + } + + vcpu_set_cpuid(vcpu); + + memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); + + pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage, + msr->idx, msr->write ? "write" : "read"); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + return; + } + +next_stage: + stage++; + kvm_vm_free(vm); + } +} + +static void guest_test_hcalls_access(void) +{ + struct kvm_cpuid2 *prev_cpuid = NULL; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage = 0; + vm_vaddr_t hcall_page, hcall_params; + struct hcall_data *hcall; + + while (true) { + vm = vm_create_with_one_vcpu(&vcpu, guest_hcall); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + hcall_params = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + hcall = addr_gva2hva(vm, hcall_params); + + vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1); + + if (!prev_cpuid) { + vcpu_reset_hv_cpuid(vcpu); + + prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent); + } else { + vcpu_init_cpuid(vcpu, prev_cpuid); + } + + switch (stage) { + case 0: + vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE); + hcall->control = 0xbeef; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + + case 1: + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 2: + vcpu_set_cpuid_feature(vcpu, HV_POST_MESSAGES); + hcall->control = HVCALL_POST_MESSAGE; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 3: + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 4: + vcpu_set_cpuid_feature(vcpu, HV_SIGNAL_EVENTS); + hcall->control = HVCALL_SIGNAL_EVENT; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + + case 5: + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + case 6: + vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING); + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 7: + vcpu_set_cpuid_feature(vcpu, HV_DEBUGGING); + hcall->control = HVCALL_RESET_DEBUG_SESSION; + hcall->expect = HV_STATUS_OPERATION_DENIED; + break; + + case 8: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_ACCESS_DENIED; + 
break; + case 9: + vcpu_set_cpuid_feature(vcpu, HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 10: + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 11: + vcpu_set_cpuid_feature(vcpu, HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 12: + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 13: + vcpu_set_cpuid_feature(vcpu, HV_X64_CLUSTER_IPI_RECOMMENDED); + hcall->control = HVCALL_SEND_IPI; + hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + case 14: + /* Nothing in 'sparse banks' -> success */ + hcall->control = HVCALL_SEND_IPI_EX; + hcall->expect = HV_STATUS_SUCCESS; + break; + + case 15: + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 16: + vcpu_set_cpuid_feature(vcpu, HV_PV_SPINLOCKS_TEST); + hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 17: + /* XMM fast hypercall */ + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = true; + break; + case 18: + vcpu_set_cpuid_feature(vcpu, HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE); + hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT; + hcall->ud_expected = false; + hcall->expect = HV_STATUS_SUCCESS; + break; + case 19: + hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES; + hcall->expect = HV_STATUS_ACCESS_DENIED; + break; + case 20: + vcpu_set_cpuid_feature(vcpu, HV_ENABLE_EXTENDED_HYPERCALLS); + hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES | HV_HYPERCALL_FAST_BIT; + hcall->expect = HV_STATUS_INVALID_PARAMETER; + break; + case 21: + kvm_vm_free(vm); + return; + } + + vcpu_set_cpuid(vcpu); + + memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent)); + + pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unhandled ucall: %ld", uc.cmd); + return; + } + + stage++; + kvm_vm_free(vm); + } +} + +int main(void) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENFORCE_CPUID)); + + pr_info("Testing access to Hyper-V specific MSRs\n"); + guest_test_msrs_access(); + + pr_info("Testing access to Hyper-V hypercalls\n"); + guest_test_hcalls_access(); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c new file mode 100644 index 000000000000..22c0c124582f --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. 
+ * + */ +#include <pthread.h> +#include <inttypes.h> + +#include "kvm_util.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define RECEIVER_VCPU_ID_1 2 +#define RECEIVER_VCPU_ID_2 65 + +#define IPI_VECTOR 0xfe + +static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1]; + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[2]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +}; + +static inline void hv_init(vm_vaddr_t pgs_gpa) +{ + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); +} + +static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + u32 vcpu_id; + + x2apic_enable(); + hv_init(pgs_gpa); + + vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + /* Signal sender vCPU we're ready */ + ipis_rcvd[vcpu_id] = (u64)-1; + + for (;;) + asm volatile("sti; hlt; cli"); +} + +static void guest_ipi_handler(struct ex_regs *regs) +{ + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + + ipis_rcvd[vcpu_id]++; + wrmsr(HV_X64_MSR_EOI, 1); +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 100000000; i++) + asm volatile("nop"); +} + +static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) +{ + struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page; + struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page; + int stage = 1, ipis_expected[2] = {0}; + + hv_init(pgs_gpa); + GUEST_SYNC(stage++); + + /* Wait for receiver vCPUs to come up */ + while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2]) + nop_loop(); + ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0; + + /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + ipi->vector = IPI_VECTOR; + ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ + hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 0; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == 
++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; + ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; + ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); + ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); + hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), + pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */ + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ + memset(hcall_page, 0, 4096); + ipi_ex->vector = IPI_VECTOR; + ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + /* + * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL. 
+ */ + ipi_ex->vp_set.valid_bank_mask = 0; + hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2); + hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT, + IPI_VECTOR, HV_GENERIC_SET_ALL); + nop_loop(); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); + GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); + GUEST_SYNC(stage++); + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + int old, r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + + TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id); + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + vm_vaddr_t hcall_page; + pthread_t threads[2]; + int stage = 1, r; + struct ucall uc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_SEND_IPI)); + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + + vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code); + vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code); + vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); + + vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page)); + vcpu_set_hv_cpuid(vcpu[0]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", r); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create failed errno=%d", errno); + + while (true) { + vcpu_run(vcpu[0]); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)", + uc.args[1], stage); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return r; +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c new file mode 100644 index 000000000000..b987a3d79715 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022, Red Hat, Inc. + * + * Tests for Hyper-V extensions to SVM. 
+ */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/bitmap.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "hyperv.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* Exit to L1 from L2 with RDMSR instruction */ +static inline void rdmsr_from_l2(uint32_t msr) +{ + /* Currently, L1 doesn't preserve GPRs during vmexits. */ + __asm__ __volatile__ ("rdmsr" : : "c"(msr) : + "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); +} + +void l2_guest_code(void) +{ + u64 unused; + + GUEST_SYNC(3); + /* Exit to L1 */ + vmmcall(); + + /* MSR-Bitmap tests */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_FS_BASE); /* intercepted */ + rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */ + vmmcall(); + rdmsr_from_l2(MSR_GS_BASE); /* intercepted */ + + GUEST_SYNC(5); + + /* L2 TLB flush tests */ + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + rdmsr_from_l2(MSR_FS_BASE); + /* + * Note: hypercall status (RAX) is not preserved correctly by L1 after + * synthetic vmexit, use unchecked version. + */ + __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS, &unused); + + /* Done, exit to L1 and never come back. */ + vmmcall(); +} + +static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm, + struct hyperv_test_pages *hv_pages, + vm_vaddr_t pgs_gpa) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments; + + GUEST_SYNC(1); + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa); + enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist); + + GUEST_ASSERT(svm->vmcb_gpa); + /* Prepare for L2 execution. 
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* L2 TLB flush setup */ + hve->partition_assist_page = hv_pages->partition_assist_gpa; + hve->hv_enlightenments_control.nested_flush_hypercall = 1; + hve->hv_vm_id = 1; + hve->hv_vp_id = 1; + current_vp_assist->nested_control.features.directhypercall = 1; + *(u32 *)(hv_pages->partition_assist) = 0; + + GUEST_SYNC(2); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(4); + vmcb->save.rip += 3; + + /* Intercept RDMSR 0xc0000100 */ + vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT; + __set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800); + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Enable enlightened MSR bitmap */ + hve->hv_enlightenments_control.msr_bitmap = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + /* Intercept RDMSR 0xc0000101 without telling KVM about it */ + __set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800); + /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */ + vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */ + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + vmcb->save.rip += 3; /* vmcall */ + + /* Now tell KVM we've changed MSR-Bitmap */ + vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + + + /* + * L2 TLB flush test. First VMCALL should be handled directly by L0, + * no VMCALL exit expected. + */ + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR); + vmcb->save.rip += 2; /* rdmsr */ + /* Enable synthetic vmexit */ + *(u32 *)(hv_pages->partition_assist) = 1; + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL); + GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH); + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_SYNC(6); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_gva = 0, hv_pages_gva = 0; + vm_vaddr_t hcall_page; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH)); + + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vcpu); + vcpu_alloc_svm(vm, &nested_gva); + vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva); + + hcall_page = vm_vaddr_alloc_pages(vm, 1); + memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize()); + + vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page)); + vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + /* UCALL_SYNC is handled here. 
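+ * GUEST_SYNC() reports the string "hello" in args[0] and the stage
+ * number in args[1]; both are validated below.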
*/ + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + } + +done: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c new file mode 100644 index 000000000000..077cd0ec3040 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests + * + * Copyright (C) 2022, Red Hat, Inc. + * + */ +#include <asm/barrier.h> +#include <pthread.h> +#include <inttypes.h> + +#include "kvm_util.h" +#include "processor.h" +#include "hyperv.h" +#include "test_util.h" +#include "vmx.h" + +#define WORKER_VCPU_ID_1 2 +#define WORKER_VCPU_ID_2 65 + +#define NTRY 100 +#define NTEST_PAGES 2 + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +}; + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_tlb_flush { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +/* + * Pass the following info to 'workers' and 'sender' + * - Hypercall page's GVA + * - Hypercall page's GPA + * - Test pages GVA + * - GVAs of the test pages' PTEs + */ +struct test_data { + vm_vaddr_t hcall_gva; + vm_paddr_t hcall_gpa; + vm_vaddr_t test_pages; + vm_vaddr_t test_pages_pte[NTEST_PAGES]; +}; + +/* 'Worker' vCPU code checking the contents of the test page */ +static void worker_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX); + void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES; + u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64)); + u64 expected, val; + + x2apic_enable(); + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + + for (;;) { + cpu_relax(); + + expected = READ_ONCE(*this_cpu); + + /* + * Make sure the value in the test page is read after reading + * the expectation for the first time. Pairs with wmb() in + * prepare_to_test(). + */ + rmb(); + + val = READ_ONCE(*(u64 *)data->test_pages); + + /* + * Make sure the value in the test page is read after before + * reading the expectation for the second time. Pairs with wmb() + * post_test(). + */ + rmb(); + + /* + * '0' indicates the sender is between iterations, wait until + * the sender is ready for this vCPU to start checking again. + */ + if (!expected) + continue; + + /* + * Re-read the per-vCPU byte to ensure the sender didn't move + * onto a new iteration. + */ + if (expected != READ_ONCE(*this_cpu)) + continue; + + GUEST_ASSERT(val == expected); + } +} + +/* + * Write per-CPU info indicating what each 'worker' CPU is supposed to see in + * test page. '0' means don't check. 
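+ * The expectations live in the extra page right after the two test pages,
+ * one u64 slot per vCPU index.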
+ */ +static void set_expected_val(void *addr, u64 val, int vcpu_id) +{ + void *exp_page = addr + PAGE_SIZE * NTEST_PAGES; + + *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val; +} + +/* + * Update PTEs swapping two test pages. + * TODO: use swap()/xchg() when these are provided. + */ +static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2) +{ + uint64_t tmp = *(uint64_t *)pte_gva1; + + *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2; + *(uint64_t *)pte_gva2 = tmp; +} + +/* + * TODO: replace the silly NOP loop with a proper udelay() implementation. + */ +static inline void do_delay(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +/* + * Prepare to test: 'disable' workers by setting the expectation to '0', + * clear hypercall input page and then swap two test pages. + */ +static inline void prepare_to_test(struct test_data *data) +{ + /* Clear hypercall input page */ + memset((void *)data->hcall_gva, 0, PAGE_SIZE); + + /* 'Disable' workers */ + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2); + + /* Make sure workers are 'disabled' before we swap PTEs. */ + wmb(); + + /* Make sure workers have enough time to notice */ + do_delay(); + + /* Swap test page mappings */ + swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]); +} + +/* + * Finalize the test: check hypercall resule set the expected val for + * 'worker' CPUs and give them some time to test. + */ +static inline void post_test(struct test_data *data, u64 exp1, u64 exp2) +{ + /* Make sure we change the expectation after swapping PTEs */ + wmb(); + + /* Set the expectation for workers, '0' means don't test */ + set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1); + set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2); + + /* Make sure workers have enough time to test */ + do_delay(); +} + +#define TESTVAL1 0x0101010101010101 +#define TESTVAL2 0x0202020202020202 + +/* Main vCPU doing the test */ +static void sender_guest_code(vm_vaddr_t test_data) +{ + struct test_data *data = (struct test_data *)test_data; + struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva; + struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva; + vm_paddr_t hcall_gpa = data->hcall_gpa; + int i, stage = 1; + + wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID); + wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa); + + /* "Slow" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? 
TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->processor_mask = 0; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa, + hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS; + flush->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + hcall_gpa, hcall_gpa + PAGE_SIZE); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + /* "Fast" hypercalls */ + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->processor_mask = BIT(WORKER_VCPU_ID_1); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | + HV_HYPERCALL_FAST_BIT, 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush->processor_mask, 1); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0, + HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | + HV_FLUSH_ALL_PROCESSORS); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [1] */ + flush_ex->gva_list[1] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) | + BIT_ULL(WORKER_VCPU_ID_1 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : + TESTVAL2, i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; + flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) | + BIT_ULL(WORKER_VCPU_ID_2 / 64); + flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64); + flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64); + /* bank_contents and gva_list occupy the same space, thus [2] */ + flush_ex->gva_list[2] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (2 << HV_HYPERCALL_VARHEAD_OFFSET) | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX | + HV_HYPERCALL_FAST_BIT, + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? TESTVAL1 : TESTVAL2); + } + + GUEST_SYNC(stage++); + + /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */ + for (i = 0; i < NTRY; i++) { + prepare_to_test(data); + flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush_ex->gva_list[0] = (u64)data->test_pages; + hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2); + hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX | + HV_HYPERCALL_FAST_BIT | + (1UL << HV_HYPERCALL_REP_COMP_OFFSET), + 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES); + post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, + i % 2 ? 
TESTVAL1 : TESTVAL2); + } + + GUEST_DONE(); +} + +static void *vcpu_thread(void *arg) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + default: + TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu[3]; + pthread_t threads[2]; + vm_vaddr_t test_data_page, gva; + vm_paddr_t gpa; + uint64_t *pte; + struct test_data *data; + struct ucall uc; + int stage = 1, r, i; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TLBFLUSH)); + + vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code); + + /* Test data page */ + test_data_page = vm_vaddr_alloc_page(vm); + data = (struct test_data *)addr_gva2hva(vm, test_data_page); + + /* Hypercall input/output */ + data->hcall_gva = vm_vaddr_alloc_pages(vm, 2); + data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva); + memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE); + + /* + * Test pages: the first one is filled with '0x01's, the second with '0x02's + * and the test will swap their mappings. The third page keeps the indication + * about the current state of mappings. + */ + data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1); + for (i = 0; i < NTEST_PAGES; i++) + memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i), + (u8)(i + 1), PAGE_SIZE); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1); + set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2); + + /* + * Get PTE pointers for test pages and map them inside the guest. + * Use separate page for each PTE for simplicity. + */ + gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR); + for (i = 0; i < NTEST_PAGES; i++) { + pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); + gpa = addr_hva2gpa(vm, pte); + __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); + } + + /* + * Sender vCPU which performs the test: swaps test pages, sets expectation + * for 'workers' and issues TLB flush hypercalls. 
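+ * Worker vCPU IDs 2 and 65 are deliberately sparse so the 'Ex' hypercalls
+ * exercise more than one bank of the sparse VP set format.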
+ */ + vcpu_args_set(vcpu[0], 1, test_data_page); + vcpu_set_hv_cpuid(vcpu[0]); + + /* Create worker vCPUs which check the contents of the test pages */ + vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code); + vcpu_args_set(vcpu[1], 1, test_data_page); + vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1); + vcpu_set_hv_cpuid(vcpu[1]); + + vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code); + vcpu_args_set(vcpu[2], 1, test_data_page); + vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2); + vcpu_set_hv_cpuid(vcpu[2]); + + r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]); + TEST_ASSERT(!r, "pthread_create() failed"); + + r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]); + TEST_ASSERT(!r, "pthread_create() failed"); + + while (true) { + vcpu_run(vcpu[0]); + TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO); + + switch (get_ucall(vcpu[0], &uc)) { + case UCALL_SYNC: + TEST_ASSERT(uc.args[1] == stage, + "Unexpected stage: %ld (%d expected)", + uc.args[1], stage); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + stage++; + } + +done: + cancel_join_vcpu_thread(threads[0], vcpu[1]); + cancel_join_vcpu_thread(threads[1], vcpu[2]); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c new file mode 100644 index 000000000000..5bc12222d87a --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Google LLC. + * + * Tests for adjusting the KVM clock from userspace + */ +#include <asm/kvm_para.h> +#include <asm/pvclock.h> +#include <asm/pvclock-abi.h> +#include <stdint.h> +#include <string.h> +#include <sys/stat.h> +#include <time.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct test_case { + uint64_t kvmclock_base; + int64_t realtime_offset; +}; + +static struct test_case test_cases[] = { + { .kvmclock_base = 0 }, + { .kvmclock_base = 180 * NSEC_PER_SEC }, + { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC }, + { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC }, +}; + +#define GUEST_SYNC_CLOCK(__stage, __val) \ + GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0) + +static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti) +{ + int i; + + wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED); + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc())); +} + +#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) + +static inline void assert_flags(struct kvm_clock_data *data) +{ + TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS, + "unexpected clock data flags: %x (want set: %x)", + data->flags, EXPECTED_FLAGS); +} + +static void handle_sync(struct ucall *uc, struct kvm_clock_data *start, + struct kvm_clock_data *end) +{ + uint64_t obs, exp_lo, exp_hi; + + obs = uc->args[2]; + exp_lo = start->clock; + exp_hi = end->clock; + + assert_flags(start); + assert_flags(end); + + TEST_ASSERT(exp_lo <= obs && obs <= exp_hi, + "unexpected kvm-clock value: %"PRIu64" expected range: [%"PRIu64", %"PRIu64"]", + obs, exp_lo, exp_hi); + + pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", %"PRIu64"]\n", + obs, exp_lo, exp_hi); +} + +static void handle_abort(struct ucall *uc) +{ + 
REPORT_GUEST_ASSERT(*uc); +} + +static void setup_clock(struct kvm_vm *vm, struct test_case *test_case) +{ + struct kvm_clock_data data; + + memset(&data, 0, sizeof(data)); + + data.clock = test_case->kvmclock_base; + if (test_case->realtime_offset) { + struct timespec ts; + int r; + + data.flags |= KVM_CLOCK_REALTIME; + do { + r = clock_gettime(CLOCK_REALTIME, &ts); + if (!r) + break; + } while (errno == EINTR); + + TEST_ASSERT(!r, "clock_gettime() failed: %d", r); + + data.realtime = ts.tv_sec * NSEC_PER_SEC; + data.realtime += ts.tv_nsec; + data.realtime += test_case->realtime_offset; + } + + vm_ioctl(vm, KVM_SET_CLOCK, &data); +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct kvm_clock_data start, end; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + setup_clock(vm, &test_cases[i]); + + vm_ioctl(vm, KVM_GET_CLOCK, &start); + + vcpu_run(vcpu); + vm_ioctl(vm, KVM_GET_CLOCK, &end); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + handle_sync(&uc, &start, &end); + break; + case UCALL_ABORT: + handle_abort(&uc); + return; + default: + TEST_ASSERT(0, "unhandled ucall: %ld", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t pvti_gva; + vm_paddr_t pvti_gpa; + struct kvm_vm *vm; + int flags; + + flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK); + TEST_REQUIRE(flags & KVM_CLOCK_REALTIME); + + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000); + pvti_gpa = addr_gva2gpa(vm, pvti_gva); + vcpu_args_set(vcpu, 2, pvti_gpa, pvti_gva); + + enter_guest(vcpu); + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c new file mode 100644 index 000000000000..78878b3a2725 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for KVM paravirtual feature disablement + */ +#include <asm/kvm_para.h> +#include <linux/kvm_para.h> +#include <stdint.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct msr_data { + uint32_t idx; + const char *name; +}; + +#define TEST_MSR(msr) { .idx = msr, .name = #msr } +#define UCALL_PR_MSR 0xdeadbeef +#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr) + +/* + * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or + * written, as the KVM_CPUID_FEATURES leaf is cleared. + */ +static struct msr_data msrs_to_test[] = { + TEST_MSR(MSR_KVM_SYSTEM_TIME), + TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW), + TEST_MSR(MSR_KVM_WALL_CLOCK), + TEST_MSR(MSR_KVM_WALL_CLOCK_NEW), + TEST_MSR(MSR_KVM_ASYNC_PF_EN), + TEST_MSR(MSR_KVM_STEAL_TIME), + TEST_MSR(MSR_KVM_PV_EOI_EN), + TEST_MSR(MSR_KVM_POLL_CONTROL), + TEST_MSR(MSR_KVM_ASYNC_PF_INT), + TEST_MSR(MSR_KVM_ASYNC_PF_ACK), +}; + +static void test_msr(struct msr_data *msr) +{ + uint64_t ignored; + uint8_t vector; + + PR_MSR(msr); + + vector = rdmsr_safe(msr->idx, &ignored); + GUEST_ASSERT_EQ(vector, GP_VECTOR); + + vector = wrmsr_safe(msr->idx, 0); + GUEST_ASSERT_EQ(vector, GP_VECTOR); +} + +struct hcall_data { + uint64_t nr; + const char *name; +}; + +#define TEST_HCALL(hc) { .nr = hc, .name = #hc } +#define UCALL_PR_HCALL 0xdeadc0de +#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc) + +/* + * KVM hypercalls to test. 
Expect -KVM_ENOSYS when called, as the corresponding + * features have been cleared in KVM_CPUID_FEATURES. + */ +static struct hcall_data hcalls_to_test[] = { + TEST_HCALL(KVM_HC_KICK_CPU), + TEST_HCALL(KVM_HC_SEND_IPI), + TEST_HCALL(KVM_HC_SCHED_YIELD), +}; + +static void test_hcall(struct hcall_data *hc) +{ + uint64_t r; + + PR_HCALL(hc); + r = kvm_hypercall(hc->nr, 0, 0, 0, 0); + GUEST_ASSERT_EQ(r, -KVM_ENOSYS); +} + +static void guest_main(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) { + test_msr(&msrs_to_test[i]); + } + + for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) { + test_hcall(&hcalls_to_test[i]); + } + + GUEST_DONE(); +} + +static void pr_msr(struct ucall *uc) +{ + struct msr_data *msr = (struct msr_data *)uc->args[0]; + + pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx); +} + +static void pr_hcall(struct ucall *uc) +{ + struct hcall_data *hc = (struct hcall_data *)uc->args[0]; + + pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr); +} + +static void enter_guest(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_PR_MSR: + pr_msr(&uc); + break; + case UCALL_PR_HCALL: + pr_hcall(&uc); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + case UCALL_DONE: + return; + } + } +} + +static void test_pv_unhalt(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_cpuid_entry2 *ent; + u32 kvm_sig_old; + + pr_info("testing KVM_FEATURE_PV_UNHALT\n"); + + TEST_REQUIRE(KVM_CAP_X86_DISABLE_EXITS); + + /* KVM_PV_UNHALT test */ + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_KVM_PV_UNHALT); + + TEST_ASSERT(vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "Enabling X86_FEATURE_KVM_PV_UNHALT had no effect"); + + /* Make sure KVM clears vcpu->arch.kvm_cpuid */ + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + kvm_sig_old = ent->ebx; + ent->ebx = 0xdeadbeef; + vcpu_set_cpuid(vcpu); + + vm_enable_cap(vm, KVM_CAP_X86_DISABLE_EXITS, KVM_X86_DISABLE_EXITS_HLT); + ent = vcpu_get_cpuid_entry(vcpu, KVM_CPUID_SIGNATURE); + ent->ebx = kvm_sig_old; + vcpu_set_cpuid(vcpu); + + TEST_ASSERT(!vcpu_cpuid_has(vcpu, X86_FEATURE_KVM_PV_UNHALT), + "KVM_FEATURE_PV_UNHALT is set with KVM_CAP_X86_DISABLE_EXITS"); + + /* FIXME: actually test KVM_FEATURE_PV_UNHALT feature */ + + kvm_vm_free(vm); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_main); + + vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1); + + vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES); + + enter_guest(vcpu); + kvm_vm_free(vm); + + test_pv_unhalt(); +} diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c new file mode 100644 index 000000000000..3cc4b86832fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * maximum APIC ID capability tests + * + * Copyright (C) 2022, Intel, Inc. 
+ * + * Tests for getting/setting maximum APIC ID capability + */ + +#include "kvm_util.h" + +#define MAX_VCPU_ID 2 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones(); + + /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */ + ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID); + + /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */ + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, ret + 1); + TEST_ASSERT(ret < 0, + "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail"); + + /* Set KVM_CAP_MAX_VCPU_ID */ + vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID); + + + /* Try to set KVM_CAP_MAX_VCPU_ID again */ + ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1); + TEST_ASSERT(ret < 0, + "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail"); + + /* Create vCPU with id beyond KVM_CAP_MAX_VCPU_ID cap*/ + ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID); + TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail"); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c deleted file mode 100644 index e6480fd5c4bd..000000000000 --- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * mmio_warning_test - * - * Copyright (C) 2019, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Test that we don't get a kernel warning when we call KVM_RUN after a - * triple fault occurs. To get the triple fault to occur we call KVM_RUN - * on a VCPU that hasn't been properly setup. - * - */ - -#define _GNU_SOURCE -#include <fcntl.h> -#include <kvm_util.h> -#include <linux/kvm.h> -#include <processor.h> -#include <pthread.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <test_util.h> -#include <unistd.h> - -#define NTHREAD 4 -#define NPROCESS 5 - -struct thread_context { - int kvmcpu; - struct kvm_run *run; -}; - -void *thr(void *arg) -{ - struct thread_context *tc = (struct thread_context *)arg; - int res; - int kvmcpu = tc->kvmcpu; - struct kvm_run *run = tc->run; - - res = ioctl(kvmcpu, KVM_RUN, 0); - pr_info("ret1=%d exit_reason=%d suberror=%d\n", - res, run->exit_reason, run->internal.suberror); - - return 0; -} - -void test(void) -{ - int i, kvm, kvmvm, kvmcpu; - pthread_t th[NTHREAD]; - struct kvm_run *run; - struct thread_context tc; - - kvm = open("/dev/kvm", O_RDWR); - TEST_ASSERT(kvm != -1, "failed to open /dev/kvm"); - kvmvm = ioctl(kvm, KVM_CREATE_VM, 0); - TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed"); - kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0); - TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed"); - run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, - kvmcpu, 0); - tc.kvmcpu = kvmcpu; - tc.run = run; - srand(getpid()); - for (i = 0; i < NTHREAD; i++) { - pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc); - usleep(rand() % 10000); - } - for (i = 0; i < NTHREAD; i++) - pthread_join(th[i], NULL); -} - -int get_warnings_count(void) -{ - int warnings; - FILE *f; - - f = popen("dmesg | grep \"WARNING:\" | wc -l", "r"); - fscanf(f, "%d", &warnings); - fclose(f); - - return warnings; -} - -int main(void) -{ - int warnings_before, warnings_after; - - if (!is_intel_cpu()) { - print_skip("Must be run on an Intel CPU"); - exit(KSFT_SKIP); - } - - if 
(vm_is_unrestricted_guest(NULL)) { - print_skip("Unrestricted guest must be disabled"); - exit(KSFT_SKIP); - } - - warnings_before = get_warnings_count(); - - for (int i = 0; i < NPROCESS; ++i) { - int status; - int pid = fork(); - - if (pid < 0) - exit(1); - if (pid == 0) { - test(); - exit(0); - } - while (waitpid(pid, &status, __WALL) != pid) - ; - } - - warnings_after = get_warnings_count(); - TEST_ASSERT(warnings_before == warnings_after, - "Warnings found in kernel. Run 'dmesg' to inspect them."); - - return 0; -} diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c new file mode 100644 index 000000000000..2b550eff35f1 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "kvm_util.h" +#include "processor.h" + +#define CPUID_MWAIT (1u << 3) + +enum monitor_mwait_testcases { + MWAIT_QUIRK_DISABLED = BIT(0), + MISC_ENABLES_QUIRK_DISABLED = BIT(1), + MWAIT_DISABLED = BIT(2), +}; + +/* + * If both MWAIT and its quirk are disabled, MONITOR/MWAIT should #UD, in all + * other scenarios KVM should emulate them as nops. + */ +#define GUEST_ASSERT_MONITOR_MWAIT(insn, testcase, vector) \ +do { \ + bool fault_wanted = ((testcase) & MWAIT_QUIRK_DISABLED) && \ + ((testcase) & MWAIT_DISABLED); \ + \ + if (fault_wanted) \ + __GUEST_ASSERT((vector) == UD_VECTOR, \ + "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", \ + testcase, vector); \ + else \ + __GUEST_ASSERT(!(vector), \ + "Expected success on " insn " for testcase '0x%x', got '0x%x'", \ + testcase, vector); \ +} while (0) + +static void guest_monitor_wait(int testcase) +{ + u8 vector; + + GUEST_SYNC(testcase); + + /* + * Arbitrarily MONITOR this function, SVM performs fault checks before + * intercept checks, so the inputs for MONITOR and MWAIT must be valid. 
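+ * MONITOR takes its linear address in RAX with extensions/hints in RCX/RDX,
+ * hence the "a"/"c"/"d" operands passed to kvm_asm_safe().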
+ */ + vector = kvm_asm_safe("monitor", "a"(guest_monitor_wait), "c"(0), "d"(0)); + GUEST_ASSERT_MONITOR_MWAIT("MONITOR", testcase, vector); + + vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0)); + GUEST_ASSERT_MONITOR_MWAIT("MWAIT", testcase, vector); +} + +static void guest_code(void) +{ + guest_monitor_wait(MWAIT_DISABLED); + + guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED); + + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED); + guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + uint64_t disabled_quirks; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + int testcase; + + TEST_REQUIRE(this_cpu_has(X86_FEATURE_MWAIT)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT); + + while (1) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + testcase = uc.args[1]; + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto done; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + goto done; + } + + disabled_quirks = 0; + if (testcase & MWAIT_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS; + if (testcase & MISC_ENABLES_QUIRK_DISABLED) + disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT; + vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks); + + /* + * If the MISC_ENABLES quirk (KVM neglects to update CPUID to + * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT + * bit in MISC_ENABLES accordingly. If the quirk is enabled, + * the only valid configuration is MWAIT disabled, as CPUID + * can't be manually changed after running the vCPU. + */ + if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) { + TEST_ASSERT(testcase & MWAIT_DISABLED, + "Can't toggle CPUID features after running vCPU"); + continue; + } + + vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE, + (testcase & MWAIT_DISABLED) ? 0 : MSR_IA32_MISC_ENABLE_MWAIT); + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c new file mode 100644 index 000000000000..3eb0313ffa39 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#define L2_GUEST_STACK_SIZE 256 + +/* + * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with + * the "real" exceptions used, #SS/#GP/#DF (12/13/8). + */ +#define FAKE_TRIPLE_FAULT_VECTOR 0xaa + +/* Arbitrary 32-bit error code injected by this test. */ +#define SS_ERROR_CODE 0xdeadbeef + +/* + * Bit '0' is set on Intel if the exception occurs while delivering a previous + * event/exception. AMD's wording is ambiguous, but presumably the bit is set + * if the exception occurs while delivering an external event, e.g. NMI or INTR, + * but not for exceptions that occur when delivering other exceptions or + * software interrupts. 
+ * + * Note, Intel's name for it, "External event", is misleading and much more + * aligned with AMD's behavior, but the SDM is quite clear on its behavior. + */ +#define ERROR_CODE_EXT_FLAG BIT(0) + +/* + * Bit '1' is set if the fault occurred when looking up a descriptor in the + * IDT, which is the case here as the IDT is empty/NULL. + */ +#define ERROR_CODE_IDT_FLAG BIT(1) + +/* + * The #GP that occurs when vectoring #SS should show the index into the IDT + * for #SS, plus have the "IDT flag" set. + */ +#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG) +#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG) + +/* + * Intel and AMD both shove '0' into the error code on #DF, regardless of what + * led to the double fault. + */ +#define DF_ERROR_CODE 0 + +#define INTERCEPT_SS (BIT_ULL(SS_VECTOR)) +#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR)) +#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR)) + +static void l2_ss_pending_test(void) +{ + GUEST_SYNC(SS_VECTOR); +} + +static void l2_ss_injected_gp_test(void) +{ + GUEST_SYNC(GP_VECTOR); +} + +static void l2_ss_injected_df_test(void) +{ + GUEST_SYNC(DF_VECTOR); +} + +static void l2_ss_injected_tf_test(void) +{ + GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR); +} + +static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector, + uint32_t error_code) +{ + struct vmcb *vmcb = svm->vmcb; + struct vmcb_control_area *ctrl = &vmcb->control; + + vmcb->save.rip = (u64)l2_code; + run_guest(vmcb, svm->vmcb_gpa); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector)); + GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + struct vmcb_control_area *ctrl = &svm->vmcb->control; + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + svm->vmcb->save.idtr.limit = 0; + ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN); + + ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF; + svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE); + svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD); + + ctrl->intercept_exceptions = INTERCEPT_SS_DF; + svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + ctrl->intercept_exceptions = INTERCEPT_SS; + svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN); + + GUEST_DONE(); +} + +static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code) +{ + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code)); + + GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0); + + if (vector == FAKE_TRIPLE_FAULT_VECTOR) + return; + + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI); + GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0); + + /* + * VMX disallows injecting an exception with error_code[31:16] != 0, + * and hardware will never generate a VM-Exit with bits 31:16 set. + * KVM should likewise truncate the "bad" userspace value. 
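+ * This is why the #SS exit below only expects the low 16 bits of
+ * SS_ERROR_CODE to be reported.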
+ */ + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0); + vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE); + vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0); + vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE); + + GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0); + vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0); + GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT); + + GUEST_DONE(); +} + +static void __attribute__((__flatten__)) l1_guest_code(void *test_data) +{ + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else + l1_vmx_code(test_data); +} + +static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector) +{ + struct ucall uc; + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(vector == uc.args[1], + "Expected L2 to ask for %d, got %ld", vector, uc.args[1]); + break; + case UCALL_DONE: + TEST_ASSERT(vector == -1, + "Expected L2 to ask for %d, L2 says it's done", vector); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd); + } +} + +static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + TEST_ASSERT(!events.exception.pending, + "Vector %d unexpectedlt pending", events.exception.nr); + TEST_ASSERT(!events.exception.injected, + "Vector %d unexpectedly injected", events.exception.nr); + + events.flags = KVM_VCPUEVENT_VALID_PAYLOAD; + events.exception.pending = !inject; + events.exception.injected = inject; + events.exception.nr = SS_VECTOR; + events.exception.has_error_code = true; + events.exception.error_code = SS_ERROR_CODE; + vcpu_events_set(vcpu, &events); +} + +/* + * Verify KVM_{G,S}ET_EVENTS play nice with pending vs. injected exceptions + * when an exception is being queued for L2. Specifically, verify that KVM + * honors L1 exception intercept controls when a #SS is pending/injected, + * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted + * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1. + */ +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu_events events; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + /* Run L1 => L2. L2 should sync and request #SS. */ + vcpu_run(vcpu); + assert_ucall_vector(vcpu, SS_VECTOR); + + /* Pend #SS and request immediate exit. #SS should still be pending. */ + queue_ss_exception(vcpu, false); + vcpu->run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + /* Verify the pending events comes back out the same as it went in. 
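+ * KVM_GET_VCPU_EVENTS should still report the pending #SS along with its
+ * original error code.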
*/ + vcpu_events_get(vcpu, &events); + TEST_ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD, + KVM_VCPUEVENT_VALID_PAYLOAD); + TEST_ASSERT_EQ(events.exception.pending, true); + TEST_ASSERT_EQ(events.exception.nr, SS_VECTOR); + TEST_ASSERT_EQ(events.exception.has_error_code, true); + TEST_ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE); + + /* + * Run for real with the pending #SS, L1 should get a VM-Exit due to + * #SS interception and re-enter L2 to request #GP (via injected #SS). + */ + vcpu->run->immediate_exit = false; + vcpu_run(vcpu); + assert_ucall_vector(vcpu, GP_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 should intercept before KVM morphs it to #DF. L1 should then + * disable #GP interception and run L2 to request #DF (via #SS => #GP). + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, DF_VECTOR); + + /* + * Inject #SS, the #SS should bypass interception and cause #GP, which + * L1 is no longer interception, and so should see a #DF VM-Exit. L1 + * should then signal that is done. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR); + + /* + * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so + * should see nested TRIPLE_FAULT / SHUTDOWN. + */ + queue_ss_exception(vcpu, true); + vcpu_run(vcpu); + assert_ucall_vector(vcpu, -1); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c new file mode 100644 index 000000000000..e7efb2b35f8b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Usage: to be run via nx_huge_page_test.sh, which does the necessary + * environment setup and teardown + * + * Copyright (C) 2022, Google LLC. + */ +#include <fcntl.h> +#include <stdint.h> +#include <time.h> + +#include <test_util.h> +#include "kvm_util.h" +#include "processor.h" + +#define HPAGE_SLOT 10 +#define HPAGE_GPA (4UL << 30) /* 4G prevents collision w/ slot 0 */ +#define HPAGE_GVA HPAGE_GPA /* GVA is arbitrary, so use GPA. */ +#define PAGES_PER_2MB_HUGE_PAGE 512 +#define HPAGE_SLOT_NPAGES (3 * PAGES_PER_2MB_HUGE_PAGE) + +/* + * Passed by nx_huge_pages_test.sh to provide an easy warning if this test is + * being run without it. + */ +#define MAGIC_TOKEN 887563923 + +/* + * x86 opcode for the return instruction. Used to call into, and then + * immediately return from, memory backed with hugepages. + */ +#define RETURN_OPCODE 0xC3 + +/* Call the specified memory address. */ +static void guest_do_CALL(uint64_t target) +{ + ((void (*)(void)) target)(); +} + +/* + * Exit the VM after each memory access so that the userspace component of the + * test can make assertions about the pages backing the VM. + * + * See the below for an explanation of how each access should affect the + * backing mappings. 
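+ * The host side checks the "pages_2m" and "nx_lpage_splits" stats after
+ * each exit to verify the expected mappings.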
+ */ +void guest_code(void) +{ + uint64_t hpage_1 = HPAGE_GVA; + uint64_t hpage_2 = hpage_1 + (PAGE_SIZE * 512); + uint64_t hpage_3 = hpage_2 + (PAGE_SIZE * 512); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(1); + + READ_ONCE(*(uint64_t *)hpage_2); + GUEST_SYNC(2); + + guest_do_CALL(hpage_1); + GUEST_SYNC(3); + + guest_do_CALL(hpage_3); + GUEST_SYNC(4); + + READ_ONCE(*(uint64_t *)hpage_1); + GUEST_SYNC(5); + + READ_ONCE(*(uint64_t *)hpage_3); + GUEST_SYNC(6); +} + +static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m) +{ + int actual_pages_2m; + + actual_pages_2m = vm_get_stat(vm, "pages_2m"); + + TEST_ASSERT(actual_pages_2m == expected_pages_2m, + "Unexpected 2m page count. Expected %d, got %d", + expected_pages_2m, actual_pages_2m); +} + +static void check_split_count(struct kvm_vm *vm, int expected_splits) +{ + int actual_splits; + + actual_splits = vm_get_stat(vm, "nx_lpage_splits"); + + TEST_ASSERT(actual_splits == expected_splits, + "Unexpected NX huge page split count. Expected %d, got %d", + expected_splits, actual_splits); +} + +static void wait_for_reclaim(int reclaim_period_ms) +{ + long reclaim_wait_ms; + struct timespec ts; + + reclaim_wait_ms = reclaim_period_ms * 5; + ts.tv_sec = reclaim_wait_ms / 1000; + ts.tv_nsec = (reclaim_wait_ms - (ts.tv_sec * 1000)) * 1000000; + nanosleep(&ts, NULL); +} + +void run_test(int reclaim_period_ms, bool disable_nx_huge_pages, + bool reboot_permissions) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t nr_bytes; + void *hva; + int r; + + vm = vm_create(1); + + if (disable_nx_huge_pages) { + r = __vm_disable_nx_huge_pages(vm); + if (reboot_permissions) { + TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions"); + } else { + TEST_ASSERT(r == -1 && errno == EPERM, + "This process should not have permission to disable NX huge pages"); + return; + } + } + + vcpu = vm_vcpu_add(vm, 0, guest_code); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB, + HPAGE_GPA, HPAGE_SLOT, + HPAGE_SLOT_NPAGES, 0); + + nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size; + + /* + * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the + * region into the guest with 2MiB pages whenever TDP is disabled (i.e. + * whenever KVM is shadowing the guest page tables). + * + * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge + * pages irrespective of the guest page size, so map with 4KiB pages + * to test that that is the case. + */ + if (kvm_is_tdp_enabled()) + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K); + else + virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M); + + hva = addr_gpa2hva(vm, HPAGE_GPA); + memset(hva, RETURN_OPCODE, nr_bytes); + + check_2m_page_count(vm, 0); + check_split_count(vm, 0); + + /* + * The guest code will first read from the first hugepage, resulting + * in a huge page mapping being created. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 1); + check_split_count(vm, 0); + + /* + * Then the guest code will read from the second hugepage, resulting + * in another huge page mapping being created. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, 2); + check_split_count(vm, 0); + + /* + * Next, the guest will execute from the first huge page, causing it + * to be remapped at 4k. + * + * If NX huge pages are disabled, this should have no effect. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 2 : 1); + check_split_count(vm, disable_nx_huge_pages ? 
0 : 1); + + /* + * Executing from the third huge page (previously unaccessed) will + * cause part to be mapped at 4k. + * + * If NX huge pages are disabled, it should be mapped at 2M. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); + + /* Reading from the first huge page again should have no effect. */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, disable_nx_huge_pages ? 0 : 2); + + /* Give recovery thread time to run. */ + wait_for_reclaim(reclaim_period_ms); + + /* + * Now that the reclaimer has run, all the split pages should be gone. + * + * If NX huge pages are disabled, the relaimer will not run, so + * nothing should change from here on. + */ + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1); + check_split_count(vm, 0); + + /* + * The 4k mapping on hpage 3 should have been removed, so check that + * reading from it causes a huge page mapping to be installed. + */ + vcpu_run(vcpu); + check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 2); + check_split_count(vm, 0); + + kvm_vm_free(vm); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); + puts(""); + printf(" -p: The NX reclaim period in milliseconds.\n"); + printf(" -t: The magic token to indicate environment setup is done.\n"); + printf(" -r: The test has reboot permissions and can disable NX huge pages.\n"); + puts(""); + exit(0); +} + +int main(int argc, char **argv) +{ + int reclaim_period_ms = 0, token = 0, opt; + bool reboot_permissions = false; + + while ((opt = getopt(argc, argv, "hp:t:r")) != -1) { + switch (opt) { + case 'p': + reclaim_period_ms = atoi_positive("Reclaim period", optarg); + break; + case 't': + token = atoi_paranoid(optarg); + break; + case 'r': + reboot_permissions = true; + break; + case 'h': + default: + help(argv[0]); + break; + } + } + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES)); + + __TEST_REQUIRE(token == MAGIC_TOKEN, + "This test must be run with the magic token via '-t %d'.\n" + "Running via nx_huge_pages_test.sh, which also handles " + "environment setup, is strongly recommended.", MAGIC_TOKEN); + + run_test(reclaim_period_ms, false, reboot_permissions); + run_test(reclaim_period_ms, true, reboot_permissions); + + return 0; +} + diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh new file mode 100755 index 000000000000..caad084b8bfd --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only */ +# +# Wrapper script which performs setup and cleanup for nx_huge_pages_test. +# Makes use of root privileges to set up huge pages and KVM module parameters. +# +# Copyright (C) 2022, Google LLC. + +set -e + +NX_HUGE_PAGES=$(cat /sys/module/kvm/parameters/nx_huge_pages) +NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio) +NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms) +HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + +# If we're already root, the host might not have sudo. 
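+# do_sudo therefore only shells out to sudo when running as a non-root user.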
+if [ $(whoami) == "root" ]; then + function do_sudo () { + "$@" + } +else + function do_sudo () { + sudo "$@" + } +fi + +set +e + +function sudo_echo () { + echo "$1" | do_sudo tee -a "$2" > /dev/null +} + +NXECUTABLE="$(dirname $0)/nx_huge_pages_test" + +sudo_echo test /dev/null || exit 4 # KSFT_SKIP=4 + +( + set -e + + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages + sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio + sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms + sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + + # Test with reboot permissions + if [ $(whoami) == "root" ] || sudo setcap cap_sys_boot+ep $NXECUTABLE 2> /dev/null; then + echo Running test with CAP_SYS_BOOT enabled + $NXECUTABLE -t 887563923 -p 100 -r + test $(whoami) == "root" || sudo setcap cap_sys_boot-ep $NXECUTABLE + else + echo setcap failed, skipping nx_huge_pages_test with CAP_SYS_BOOT enabled + fi + + # Test without reboot permissions + if [ $(whoami) != "root" ] ; then + echo Running test with CAP_SYS_BOOT disabled + $NXECUTABLE -t 887563923 -p 100 + else + echo Running as root, skipping nx_huge_pages_test with CAP_SYS_BOOT disabled + fi +) +RET=$? + +sudo_echo "$NX_HUGE_PAGES" /sys/module/kvm/parameters/nx_huge_pages +sudo_echo "$NX_HUGE_PAGES_RECOVERY_RATIO" /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio +sudo_echo "$NX_HUGE_PAGES_RECOVERY_PERIOD" /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms +sudo_echo "$HUGE_PAGES" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +exit $RET diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c index 1e89688cbbbf..eda88080c186 100644 --- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c +++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c @@ -9,8 +9,6 @@ * Verifies expected behavior of controlling guest access to * MSR_PLATFORM_INFO. 
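+ *
+ * The guest first reads MSR_PLATFORM_INFO with the capability enabled and
+ * checks the max turbo ratio bits, then expects rdmsr to raise #GP once
+ * userspace clears KVM_CAP_MSR_PLATFORM_INFO (see guest_code() below).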
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -21,85 +19,60 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 0 #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 static void guest_code(void) { uint64_t msr_platform_info; + uint8_t vector; - for (;;) { - msr_platform_info = rdmsr(MSR_PLATFORM_INFO); - GUEST_SYNC(msr_platform_info); - asm volatile ("inc %r11"); - } -} - -static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) -{ - struct kvm_enable_cap cap = {}; - - cap.cap = KVM_CAP_MSR_PLATFORM_INFO; - cap.flags = 0; - cap.args[0] = (int)enable; - vm_enable_cap(vm, &cap); -} + GUEST_SYNC(true); + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_ASSERT_EQ(msr_platform_info & MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); -static void test_msr_platform_info_enabled(struct kvm_vm *vm) -{ - struct kvm_run *run = vcpu_state(vm, VCPU_ID); - struct ucall uc; + GUEST_SYNC(false); + vector = rdmsr_safe(MSR_PLATFORM_INFO, &msr_platform_info); + GUEST_ASSERT_EQ(vector, GP_VECTOR); - set_msr_platform_info_enabled(vm, true); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); - get_ucall(vm, VCPU_ID, &uc); - TEST_ASSERT(uc.cmd == UCALL_SYNC, - "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd); - TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == - MSR_PLATFORM_INFO_MAX_TURBO_RATIO, - "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", - MSR_PLATFORM_INFO_MAX_TURBO_RATIO); -} - -static void test_msr_platform_info_disabled(struct kvm_vm *vm) -{ - struct kvm_run *run = vcpu_state(vm, VCPU_ID); - - set_msr_platform_info_enabled(vm, false); - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, - "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + GUEST_DONE(); } int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - int rv; uint64_t msr_platform_info; + struct ucall uc; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO)); - rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); - if (!rv) { - print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported"); - exit(KSFT_SKIP); - } + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO); + vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - vm = vm_create_default(VCPU_ID, 0, guest_code); + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + vm_enable_cap(vm, KVM_CAP_MSR_PLATFORM_INFO, uc.args[1]); + break; + case UCALL_DONE: + goto done; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall %lu", uc.cmd); + break; + } + } - msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); - vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, - msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); - test_msr_platform_info_enabled(vm); - test_msr_platform_info_disabled(vm); - vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); +done: + vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info); kvm_vm_free(vm); diff --git 
a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c new file mode 100644 index 000000000000..96446134c00b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -0,0 +1,630 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023, Tencent, Inc. + */ +#include <x86intrin.h> + +#include "pmu.h" +#include "processor.h" + +/* Number of LOOP instructions for the guest measurement payload. */ +#define NUM_BRANCHES 10 +/* + * Number of "extra" instructions that will be counted, i.e. the number of + * instructions that are needed to set up the loop and then disabled the + * counter. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR. + */ +#define NUM_EXTRA_INSNS 7 +#define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS) + +static uint8_t kvm_pmu_version; +static bool kvm_has_perf_caps; + +static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + void *guest_code, + uint8_t pmu_version, + uint64_t perf_capabilities) +{ + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(vcpu, guest_code); + sync_global_to_guest(vm, kvm_pmu_version); + + /* + * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling + * features via PERF_CAPABILITIES if the guest doesn't have a vPMU. + */ + if (kvm_has_perf_caps) + vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version); + return vm; +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + do { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_PRINTF: + pr_info("%s", uc.buffer); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } while (uc.cmd != UCALL_DONE); +} + +static uint8_t guest_get_pmu_version(void) +{ + /* + * Return the effective PMU version, i.e. the minimum between what KVM + * supports and what is enumerated to the guest. The host deliberately + * advertises a PMU version to the guest beyond what is actually + * supported by KVM to verify KVM doesn't freak out and do something + * bizarre with an architecturally valid, but unsupported, version. + */ + return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION)); +} + +/* + * If an architectural event is supported and guaranteed to generate at least + * one "hit, assert that its count is non-zero. If an event isn't supported or + * the test can't guarantee the associated action will occur, then all bets are + * off regarding the count, i.e. no checks can be done. + * + * Sanity check that in all cases, the event doesn't count when it's disabled, + * and that KVM correctly emulates the write of an arbitrary value. 
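+ *
+ * For illustration of the arithmetic above: with NUM_BRANCHES == 10, the
+ * measured sequence retires 10 LOOP instructions plus the 7 setup/teardown
+ * instructions, i.e. the instructions retired event is expected to read
+ * back exactly NUM_INSNS_RETIRED == 17.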
+ */ +static void guest_assert_event_count(uint8_t idx, + struct kvm_x86_pmu_feature event, + uint32_t pmc, uint32_t pmc_msr) +{ + uint64_t count; + + count = _rdpmc(pmc); + if (!this_pmu_has(event)) + goto sanity_checks; + + switch (idx) { + case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX: + GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED); + break; + case INTEL_ARCH_BRANCHES_RETIRED_INDEX: + GUEST_ASSERT_EQ(count, NUM_BRANCHES); + break; + case INTEL_ARCH_LLC_REFERENCES_INDEX: + case INTEL_ARCH_LLC_MISSES_INDEX: + if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) && + !this_cpu_has(X86_FEATURE_CLFLUSH)) + break; + fallthrough; + case INTEL_ARCH_CPU_CYCLES_INDEX: + case INTEL_ARCH_REFERENCE_CYCLES_INDEX: + GUEST_ASSERT_NE(count, 0); + break; + case INTEL_ARCH_TOPDOWN_SLOTS_INDEX: + GUEST_ASSERT(count >= NUM_INSNS_RETIRED); + break; + default: + break; + } + +sanity_checks: + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + GUEST_ASSERT_EQ(_rdpmc(pmc), count); + + wrmsr(pmc_msr, 0xdead); + GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead); +} + +/* + * Enable and disable the PMC in a monolithic asm blob to ensure that the + * compiler can't insert _any_ code into the measured sequence. Note, ECX + * doesn't need to be clobbered as the input value, @pmc_msr, is restored + * before the end of the sequence. + * + * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the + * start of the loop to force LLC references and misses, i.e. to allow testing + * that those events actually count. + * + * If forced emulation is enabled (and specified), force emulation on a subset + * of the measured code to verify that KVM correctly emulates instructions and + * branches retired events in conjunction with hardware also counting said + * events. + */ +#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \ +do { \ + __asm__ __volatile__("wrmsr\n\t" \ + clflush "\n\t" \ + "mfence\n\t" \ + "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" \ + FEP "loop .\n\t" \ + FEP "mov %%edi, %%ecx\n\t" \ + FEP "xor %%eax, %%eax\n\t" \ + FEP "xor %%edx, %%edx\n\t" \ + "wrmsr\n\t" \ + :: "a"((uint32_t)_value), "d"(_value >> 32), \ + "c"(_msr), "D"(_msr) \ + ); \ +} while (0) + +#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \ +do { \ + wrmsr(pmc_msr, 0); \ + \ + if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP); \ + else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP); \ + else \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \ + \ + guest_assert_event_count(_idx, _event, _pmc, _pmc_msr); \ +} while (0) + +static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event, + uint32_t pmc, uint32_t pmc_msr, + uint32_t ctrl_msr, uint64_t ctrl_msr_value) +{ + GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, ""); + + if (is_forced_emulation_enabled) + GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP); +} + +#define X86_PMU_FEATURE_NULL \ +({ \ + struct kvm_x86_pmu_feature feature = {}; \ + \ + feature; \ +}) + +static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event) +{ + return !(*(u64 *)&event); +} + +static void guest_test_arch_event(uint8_t idx) +{ + const struct { + struct kvm_x86_pmu_feature gp_event; + struct kvm_x86_pmu_feature fixed_event; + } intel_event_to_feature[] = { + [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED }, + 
[INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED }, + /* + * Note, the fixed counter for reference cycles is NOT the same + * as the general purpose architectural event. The fixed counter + * explicitly counts at the same frequency as the TSC, whereas + * the GP event counts at a fixed, but uarch specific, frequency. + * Bundle them here for simplicity. + */ + [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED }, + [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED }, + }; + + uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + uint32_t pmu_version = guest_get_pmu_version(); + /* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */ + bool guest_has_perf_global_ctrl = pmu_version >= 2; + struct kvm_x86_pmu_feature gp_event, fixed_event; + uint32_t base_pmc_msr; + unsigned int i; + + /* The host side shouldn't invoke this without a guest PMU. */ + GUEST_ASSERT(pmu_version); + + if (this_cpu_has(X86_FEATURE_PDCM) && + rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) + base_pmc_msr = MSR_IA32_PMC0; + else + base_pmc_msr = MSR_IA32_PERFCTR0; + + gp_event = intel_event_to_feature[idx].gp_event; + GUEST_ASSERT_EQ(idx, gp_event.f.bit); + + GUEST_ASSERT(nr_gp_counters); + + for (i = 0; i < nr_gp_counters; i++) { + uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS | + ARCH_PERFMON_EVENTSEL_ENABLE | + intel_pmu_arch_events[idx]; + + wrmsr(MSR_P6_EVNTSEL0 + i, 0); + if (guest_has_perf_global_ctrl) + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i)); + + __guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i, + MSR_P6_EVNTSEL0 + i, eventsel); + } + + if (!guest_has_perf_global_ctrl) + return; + + fixed_event = intel_event_to_feature[idx].fixed_event; + if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event)) + return; + + i = fixed_event.f.bit; + + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + + __guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED, + MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_CORE_PERF_GLOBAL_CTRL, + FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); +} + +static void guest_test_arch_events(void) +{ + uint8_t i; + + for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) + guest_test_arch_event(i); + + GUEST_DONE(); +} + +static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t length, uint8_t unavailable_mask) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + /* Testing arch events requires a vPMU (there are no negative tests). */ + if (!pmu_version) + return; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH, + length); + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK, + unavailable_mask); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +/* + * Limit testing to MSRs that are actually defined by Intel (in the SDM). 
MSRs + * that aren't defined counter MSRs *probably* don't exist, but there's no + * guarantee that currently undefined MSR indices won't be used for something + * other than PMCs in the future. + */ +#define MAX_NR_GP_COUNTERS 8 +#define MAX_NR_FIXED_COUNTERS 3 + +#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector) \ +__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector, \ + "Expected %s on " #insn "(0x%x), got vector %u", \ + expect_gp ? "#GP" : "no fault", msr, vector) \ + +#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected) \ + __GUEST_ASSERT(val == expected_val, \ + "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx", \ + msr, expected_val, val); + +static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success, + uint64_t expected_val) +{ + uint8_t vector; + uint64_t val; + + vector = rdpmc_safe(rdpmc_idx, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); + if (expect_success) + GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); + + if (!is_forced_emulation_enabled) + return; + + vector = rdpmc_safe_fep(rdpmc_idx, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector); + if (expect_success) + GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val); +} + +static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters, + uint8_t nr_counters, uint32_t or_mask) +{ + const bool pmu_has_fast_mode = !guest_get_pmu_version(); + uint8_t i; + + for (i = 0; i < nr_possible_counters; i++) { + /* + * TODO: Test a value that validates full-width writes and the + * width of the counters. + */ + const uint64_t test_val = 0xffff; + const uint32_t msr = base_msr + i; + + /* + * Fixed counters are supported if the counter is less than the + * number of enumerated contiguous counters *or* the counter is + * explicitly enumerated in the supported counters mask. + */ + const bool expect_success = i < nr_counters || (or_mask & BIT(i)); + + /* + * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are + * unsupported, i.e. doesn't #GP and reads back '0'. + */ + const uint64_t expected_val = expect_success ? test_val : 0; + const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 && + msr != MSR_P6_PERFCTR1; + uint32_t rdpmc_idx; + uint8_t vector; + uint64_t val; + + vector = wrmsr_safe(msr, test_val); + GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); + + vector = rdmsr_safe(msr, &val); + GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector); + + /* On #GP, the result of RDMSR is undefined. */ + if (!expect_gp) + GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val); + + /* + * Redo the read tests with RDPMC, which has different indexing + * semantics and additional capabilities. + */ + rdpmc_idx = i; + if (base_msr == MSR_CORE_PERF_FIXED_CTR0) + rdpmc_idx |= INTEL_RDPMC_FIXED; + + guest_test_rdpmc(rdpmc_idx, expect_success, expected_val); + + /* + * KVM doesn't support non-architectural PMUs, i.e. it should + * impossible to have fast mode RDPMC. Verify that attempting + * to use fast RDPMC always #GPs. 
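+ *
+ * For reference, the RDPMC index encoding assumed throughout is the
+ * architectural one: ECX bit 30 (INTEL_RDPMC_FIXED) selects the fixed
+ * counter range and bit 31 (INTEL_RDPMC_FAST) requests fast mode, e.g.
+ * fixed counter 1 is read via RDPMC with ECX = INTEL_RDPMC_FIXED | 1.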
+ */ + GUEST_ASSERT(!expect_success || !pmu_has_fast_mode); + rdpmc_idx |= INTEL_RDPMC_FAST; + guest_test_rdpmc(rdpmc_idx, false, -1ull); + + vector = wrmsr_safe(msr, 0); + GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector); + } +} + +static void guest_test_gp_counters(void) +{ + uint8_t pmu_version = guest_get_pmu_version(); + uint8_t nr_gp_counters = 0; + uint32_t base_msr; + + if (pmu_version) + nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + + /* + * For v2+ PMUs, PERF_GLOBAL_CTRL's architectural post-RESET value is + * "Sets bits n-1:0 and clears the upper bits", where 'n' is the number + * of GP counters. If there are no GP counters, require KVM to leave + * PERF_GLOBAL_CTRL '0'. This edge case isn't covered by the SDM, but + * follow the spirit of the architecture and only globally enable GP + * counters, of which there are none. + */ + if (pmu_version > 1) { + uint64_t global_ctrl = rdmsr(MSR_CORE_PERF_GLOBAL_CTRL); + + if (nr_gp_counters) + GUEST_ASSERT_EQ(global_ctrl, GENMASK_ULL(nr_gp_counters - 1, 0)); + else + GUEST_ASSERT_EQ(global_ctrl, 0); + } + + if (this_cpu_has(X86_FEATURE_PDCM) && + rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES) + base_msr = MSR_IA32_PMC0; + else + base_msr = MSR_IA32_PERFCTR0; + + guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0); + GUEST_DONE(); +} + +static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t nr_gp_counters) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS, + nr_gp_counters); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +static void guest_test_fixed_counters(void) +{ + uint64_t supported_bitmask = 0; + uint8_t nr_fixed_counters = 0; + uint8_t i; + + /* Fixed counters require Architectural vPMU Version 2+. */ + if (guest_get_pmu_version() >= 2) + nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + + /* + * The supported bitmask for fixed counters was introduced in PMU + * version 5. + */ + if (guest_get_pmu_version() >= 5) + supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK); + + guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS, + nr_fixed_counters, supported_bitmask); + + for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) { + uint8_t vector; + uint64_t val; + + if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) { + vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL, + FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for counter %u in FIXED_CTR_CTRL", i); + + vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL, + FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i); + continue; + } + + wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0); + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); + __asm__ __volatile__("loop ." 
: "+c"((int){NUM_BRANCHES})); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i); + + GUEST_ASSERT_NE(val, 0); + } + GUEST_DONE(); +} + +static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities, + uint8_t nr_fixed_counters, + uint32_t supported_bitmask) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters, + pmu_version, perf_capabilities); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK, + supported_bitmask); + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS, + nr_fixed_counters); + + run_vcpu(vcpu); + + kvm_vm_free(vm); +} + +static void test_intel_counters(void) +{ + uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); + uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); + unsigned int i; + uint8_t v, j; + uint32_t k; + + const uint64_t perf_caps[] = { + 0, + PMU_CAP_FW_WRITES, + }; + + /* + * Test up to PMU v5, which is the current maximum version defined by + * Intel, i.e. is the last version that is guaranteed to be backwards + * compatible with KVM's existing behavior. + */ + uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5); + + /* + * Detect the existence of events that aren't supported by selftests. + * This will (obviously) fail any time the kernel adds support for a + * new event, but it's worth paying that price to keep the test fresh. + */ + TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS, + "New architectural event(s) detected; please update this test (length = %u, mask = %x)", + nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK)); + + /* + * Force iterating over known arch events regardless of whether or not + * KVM/hardware supports a given event. + */ + nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS); + + for (v = 0; v <= max_pmu_version; v++) { + for (i = 0; i < ARRAY_SIZE(perf_caps); i++) { + if (!kvm_has_perf_caps && perf_caps[i]) + continue; + + pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + /* + * To keep the total runtime reasonable, test every + * possible non-zero, non-reserved bitmap combination + * only with the native PMU version and the full bit + * vector length. + */ + if (v == pmu_version) { + for (k = 1; k < (BIT(nr_arch_events) - 1); k++) + test_arch_events(v, perf_caps[i], nr_arch_events, k); + } + /* + * Test single bits for all PMU version and lengths up + * the number of events +1 (to verify KVM doesn't do + * weird things if the guest length is greater than the + * host length). Explicitly test a mask of '0' and all + * ones i.e. all events being available and unavailable. 
+ */ + for (j = 0; j <= nr_arch_events + 1; j++) { + test_arch_events(v, perf_caps[i], j, 0); + test_arch_events(v, perf_caps[i], j, 0xff); + + for (k = 0; k < nr_arch_events; k++) + test_arch_events(v, perf_caps[i], j, BIT(k)); + } + + pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + for (j = 0; j <= nr_gp_counters; j++) + test_gp_counters(v, perf_caps[i], j); + + pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n", + v, perf_caps[i]); + for (j = 0; j <= nr_fixed_counters; j++) { + for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++) + test_fixed_counters(v, perf_caps[i], j, k); + } + } + } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_is_pmu_enabled()); + + TEST_REQUIRE(host_cpu_is_intel); + TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); + + kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); + kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM); + + test_intel_counters(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c new file mode 100644 index 000000000000..26b3e7efe5dd --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -0,0 +1,901 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_SET_PMU_EVENT_FILTER. + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies the expected behavior of allow lists and deny lists for + * virtual PMU events. + */ +#include "kvm_util.h" +#include "pmu.h" +#include "processor.h" +#include "test_util.h" + +#define NUM_BRANCHES 42 +#define MAX_TEST_EVENTS 10 + +#define PMU_EVENT_FILTER_INVALID_ACTION (KVM_PMU_EVENT_DENY + 1) +#define PMU_EVENT_FILTER_INVALID_FLAGS (KVM_PMU_EVENT_FLAGS_VALID_MASK << 1) +#define PMU_EVENT_FILTER_INVALID_NEVENTS (KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1) + +struct __kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; +}; + +/* + * This event list comprises Intel's known architectural events, plus AMD's + * "retired branch instructions" for Zen1-Zen3 (and* possibly other AMD CPUs). + * Note, AMD and Intel use the same encoding for instructions retired. + */ +kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED); + +static const struct __kvm_pmu_event_filter base_event_filter = { + .nevents = ARRAY_SIZE(base_event_filter.events), + .events = { + INTEL_ARCH_CPU_CYCLES, + INTEL_ARCH_INSTRUCTIONS_RETIRED, + INTEL_ARCH_REFERENCE_CYCLES, + INTEL_ARCH_LLC_REFERENCES, + INTEL_ARCH_LLC_MISSES, + INTEL_ARCH_BRANCHES_RETIRED, + INTEL_ARCH_BRANCHES_MISPREDICTED, + INTEL_ARCH_TOPDOWN_SLOTS, + AMD_ZEN_BRANCHES_RETIRED, + }, +}; + +struct { + uint64_t loads; + uint64_t stores; + uint64_t loads_stores; + uint64_t branches_retired; + uint64_t instructions_retired; +} pmc_results; + +/* + * If we encounter a #GP during the guest PMU sanity check, then the guest + * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). + */ +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(-EFAULT); +} + +/* + * Check that we can write a new value to the given MSR and read it back. + * The caller should provide a non-empty set of bits that are safe to flip. + * + * Return on success. GUEST_SYNC(0) on error. 
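+ *
+ * E.g. check_msr(MSR_P6_EVNTSEL0, 0xffff) flips the event select and unit
+ * mask bits, which are expected to be writable whenever the vPMU is
+ * functional.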
+ */ +static void check_msr(uint32_t msr, uint64_t bits_to_flip) +{ + uint64_t v = rdmsr(msr) ^ bits_to_flip; + + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(-EIO); + + v ^= bits_to_flip; + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(-EIO); +} + +static void run_and_measure_loop(uint32_t msr_base) +{ + const uint64_t branches_retired = rdmsr(msr_base + 0); + const uint64_t insn_retired = rdmsr(msr_base + 1); + + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + + pmc_results.branches_retired = rdmsr(msr_base + 0) - branches_retired; + pmc_results.instructions_retired = rdmsr(msr_base + 1) - insn_retired; +} + +static void intel_guest_code(void) +{ + check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + check_msr(MSR_P6_EVNTSEL0, 0xffff); + check_msr(MSR_IA32_PMC0, 0xffff); + GUEST_SYNC(0); + + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED); + wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); + + run_and_measure_loop(MSR_IA32_PMC0); + GUEST_SYNC(0); + } +} + +/* + * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], + * this code uses the always-available, legacy K7 PMU MSRs, which alias to + * the first four of the six extended core PMU MSRs. + */ +static void amd_guest_code(void) +{ + check_msr(MSR_K7_EVNTSEL0, 0xffff); + check_msr(MSR_K7_PERFCTR0, 0xffff); + GUEST_SYNC(0); + + for (;;) { + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED); + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED); + + run_and_measure_loop(MSR_K7_PERFCTR0); + GUEST_SYNC(0); + } +} + +/* + * Run the VM to the next GUEST_SYNC(value), and return the value passed + * to the sync. Any other exit from the guest is fatal. + */ +static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + get_ucall(vcpu, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); + return uc.args[1]; +} + +static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu) +{ + uint64_t r; + + memset(&pmc_results, 0, sizeof(pmc_results)); + sync_global_to_guest(vcpu->vm, pmc_results); + + r = run_vcpu_to_sync(vcpu); + TEST_ASSERT(!r, "Unexpected sync value: 0x%lx", r); + + sync_global_from_guest(vcpu->vm, pmc_results); +} + +/* + * In a nested environment or if the vPMU is disabled, the guest PMU + * might not work as architected (accessing the PMU MSRs may raise + * #GP, or writes could simply be discarded). In those situations, + * there is no point in running these tests. The guest code will perform + * a sanity check and then GUEST_SYNC(success). In the case of failure, + * the behavior of the guest on resumption is undefined. + */ +static bool sanity_check_pmu(struct kvm_vcpu *vcpu) +{ + uint64_t r; + + vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); + r = run_vcpu_to_sync(vcpu); + vm_install_exception_handler(vcpu->vm, GP_VECTOR, NULL); + + return !r; +} + +/* + * Remove the first occurrence of 'event' (if any) from the filter's + * event list. 
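+ *
+ * E.g. removing INTEL_ARCH_BRANCHES_RETIRED from a copy of the base filter
+ * shifts the remaining entries down one slot and decrements nevents; an
+ * event that isn't in the list leaves the filter untouched.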
+ */ +static void remove_event(struct __kvm_pmu_event_filter *f, uint64_t event) +{ + bool found = false; + int i; + + for (i = 0; i < f->nevents; i++) { + if (found) + f->events[i - 1] = f->events[i]; + else + found = f->events[i] == event; + } + if (found) + f->nevents--; +} + +#define ASSERT_PMC_COUNTING_INSTRUCTIONS() \ +do { \ + uint64_t br = pmc_results.branches_retired; \ + uint64_t ir = pmc_results.instructions_retired; \ + \ + if (br && br != NUM_BRANCHES) \ + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", \ + __func__, br, NUM_BRANCHES); \ + TEST_ASSERT(br, "%s: Branch instructions retired = %lu (expected > 0)", \ + __func__, br); \ + TEST_ASSERT(ir, "%s: Instructions retired = %lu (expected > 0)", \ + __func__, ir); \ +} while (0) + +#define ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS() \ +do { \ + uint64_t br = pmc_results.branches_retired; \ + uint64_t ir = pmc_results.instructions_retired; \ + \ + TEST_ASSERT(!br, "%s: Branch instructions retired = %lu (expected 0)", \ + __func__, br); \ + TEST_ASSERT(!ir, "%s: Instructions retired = %lu (expected 0)", \ + __func__, ir); \ +} while (0) + +static void test_without_filter(struct kvm_vcpu *vcpu) +{ + run_vcpu_and_sync_pmc_results(vcpu); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_with_filter(struct kvm_vcpu *vcpu, + struct __kvm_pmu_event_filter *__f) +{ + struct kvm_pmu_event_filter *f = (void *)__f; + + vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); + run_vcpu_and_sync_pmc_results(vcpu); +} + +static void test_amd_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = { + .action = KVM_PMU_EVENT_DENY, + .nevents = 1, + .events = { + RAW_EVENT(0x1C2, 0), + }, + }; + + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_member_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_DENY; + test_with_filter(vcpu, &f); + + ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); +} + +static void test_member_allow_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_ALLOW; + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_not_member_deny_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_DENY; + + remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); + remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); + remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); + test_with_filter(vcpu, &f); + + ASSERT_PMC_COUNTING_INSTRUCTIONS(); +} + +static void test_not_member_allow_list(struct kvm_vcpu *vcpu) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = KVM_PMU_EVENT_ALLOW; + + remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED); + remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED); + remove_event(&f, AMD_ZEN_BRANCHES_RETIRED); + test_with_filter(vcpu, &f); + + ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS(); +} + +/* + * Verify that setting KVM_PMU_CAP_DISABLE prevents the use of the PMU. + * + * Note that KVM_CAP_PMU_CAPABILITY must be invoked prior to creating VCPUs. 
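+ *
+ * The required ordering is thus: vm_create(), then vm_enable_cap() with
+ * KVM_CAP_PMU_CAPABILITY and KVM_PMU_CAP_DISABLE, and only then
+ * vm_vcpu_add(), as done below.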
+ */ +static void test_pmu_config_disable(void (*guest_code)(void)) +{ + struct kvm_vcpu *vcpu; + int r; + struct kvm_vm *vm; + + r = kvm_check_cap(KVM_CAP_PMU_CAPABILITY); + if (!(r & KVM_PMU_CAP_DISABLE)) + return; + + vm = vm_create(1); + + vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE); + + vcpu = vm_vcpu_add(vm, 0, guest_code); + TEST_ASSERT(!sanity_check_pmu(vcpu), + "Guest should not be able to use disabled PMU."); + + kvm_vm_free(vm); +} + +/* + * On Intel, check for a non-zero PMU version, at least one general-purpose + * counter per logical processor, and support for counting the number of branch + * instructions retired. + */ +static bool use_intel_pmu(void) +{ + return host_cpu_is_intel && + kvm_cpu_property(X86_PROPERTY_PMU_VERSION) && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) && + kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED); +} + +static bool is_zen1(uint32_t family, uint32_t model) +{ + return family == 0x17 && model <= 0x0f; +} + +static bool is_zen2(uint32_t family, uint32_t model) +{ + return family == 0x17 && model >= 0x30 && model <= 0x3f; +} + +static bool is_zen3(uint32_t family, uint32_t model) +{ + return family == 0x19 && model <= 0x0f; +} + +/* + * Determining AMD support for a PMU event requires consulting the AMD + * PPR for the CPU or reference material derived therefrom. The AMD + * test code herein has been verified to work on Zen1, Zen2, and Zen3. + * + * Feel free to add more AMD CPUs that are documented to support event + * select 0xc2 umask 0 as "retired branch instructions." + */ +static bool use_amd_pmu(void) +{ + uint32_t family = kvm_cpu_family(); + uint32_t model = kvm_cpu_model(); + + return host_cpu_is_amd && + (is_zen1(family, model) || + is_zen2(family, model) || + is_zen3(family, model)); +} + +/* + * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and + * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/ + * supported on Intel Xeon processors: + * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake. + */ +#define MEM_INST_RETIRED 0xD0 +#define MEM_INST_RETIRED_LOAD RAW_EVENT(MEM_INST_RETIRED, 0x81) +#define MEM_INST_RETIRED_STORE RAW_EVENT(MEM_INST_RETIRED, 0x82) +#define MEM_INST_RETIRED_LOAD_STORE RAW_EVENT(MEM_INST_RETIRED, 0x83) + +static bool supports_event_mem_inst_retired(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if (x86_family(eax) == 0x6) { + switch (x86_model(eax)) { + /* Sapphire Rapids */ + case 0x8F: + /* Ice Lake */ + case 0x6A: + /* Skylake */ + /* Cascade Lake */ + case 0x55: + return true; + } + } + + return false; +} + +/* + * "LS Dispatch", from Processor Programming Reference + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, + * Preliminary Processor Programming Reference (PPR) for AMD Family + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision + * B1 Processors Volume 1 of 2. 
+ */ +#define LS_DISPATCH 0x29 +#define LS_DISPATCH_LOAD RAW_EVENT(LS_DISPATCH, BIT(0)) +#define LS_DISPATCH_STORE RAW_EVENT(LS_DISPATCH, BIT(1)) +#define LS_DISPATCH_LOAD_STORE RAW_EVENT(LS_DISPATCH, BIT(2)) + +#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false) +#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true) + +static void masked_events_guest_test(uint32_t msr_base) +{ + /* + * The actual value of the counters don't determine the outcome of + * the test. Only that they are zero or non-zero. + */ + const uint64_t loads = rdmsr(msr_base + 0); + const uint64_t stores = rdmsr(msr_base + 1); + const uint64_t loads_stores = rdmsr(msr_base + 2); + int val; + + + __asm__ __volatile__("movl $0, %[v];" + "movl %[v], %%eax;" + "incl %[v];" + : [v]"+m"(val) :: "eax"); + + pmc_results.loads = rdmsr(msr_base + 0) - loads; + pmc_results.stores = rdmsr(msr_base + 1) - stores; + pmc_results.loads_stores = rdmsr(msr_base + 2) - loads_stores; +} + +static void intel_masked_events_guest_code(void) +{ + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD); + wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE); + wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE); + + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7); + + masked_events_guest_test(MSR_IA32_PMC0); + GUEST_SYNC(0); + } +} + +static void amd_masked_events_guest_code(void) +{ + for (;;) { + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL1, 0); + wrmsr(MSR_K7_EVNTSEL2, 0); + + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD); + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE); + wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE); + + masked_events_guest_test(MSR_K7_PERFCTR0); + GUEST_SYNC(0); + } +} + +static void run_masked_events_test(struct kvm_vcpu *vcpu, + const uint64_t masked_events[], + const int nmasked_events) +{ + struct __kvm_pmu_event_filter f = { + .nevents = nmasked_events, + .action = KVM_PMU_EVENT_ALLOW, + .flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + }; + + memcpy(f.events, masked_events, sizeof(uint64_t) * nmasked_events); + test_with_filter(vcpu, &f); +} + +#define ALLOW_LOADS BIT(0) +#define ALLOW_STORES BIT(1) +#define ALLOW_LOADS_STORES BIT(2) + +struct masked_events_test { + uint64_t intel_events[MAX_TEST_EVENTS]; + uint64_t intel_event_end; + uint64_t amd_events[MAX_TEST_EVENTS]; + uint64_t amd_event_end; + const char *msg; + uint32_t flags; +}; + +/* + * These are the test cases for the masked events tests. + * + * For each test, the guest enables 3 PMU counters (loads, stores, + * loads + stores). The filter is then set in KVM with the masked events + * provided. The test then verifies that the counters agree with which + * ones should be counting and which ones should be filtered. 
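+ *
+ * As a worked example of the encoding (a masked entry matches when
+ * (unit mask & mask) == match): INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED,
+ * 0xFF, 0x81) admits only the loads umask 0x81, whereas a mask of 0x7C
+ * with a match of 0 admits 0x81, 0x82 and 0x83, i.e. loads, stores and
+ * loads + stores together.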
+ */ +const struct masked_events_test test_cases[] = { + { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow loads.", + .flags = ALLOW_LOADS, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow stores.", + .flags = ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)), + }, + .msg = "Only allow loads + stores.", + .flags = ALLOW_LOADS_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0), + }, + .msg = "Only allow loads and stores.", + .flags = ALLOW_LOADS | ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow loads and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow stores and loads + stores.", + .flags = ALLOW_STORES | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + }, + .msg = "Only allow loads, stores, and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES + }, +}; + +static int append_test_events(const struct masked_events_test *test, + uint64_t *events, int nevents) +{ + const uint64_t *evts; + int i; + + evts = use_intel_pmu() ? test->intel_events : test->amd_events; + for (i = 0; i < MAX_TEST_EVENTS; i++) { + if (evts[i] == 0) + break; + + events[nevents + i] = evts[i]; + } + + return nevents + i; +} + +static bool bool_eq(bool a, bool b) +{ + return a == b; +} + +static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events, + int nevents) +{ + int ntests = ARRAY_SIZE(test_cases); + int i, n; + + for (i = 0; i < ntests; i++) { + const struct masked_events_test *test = &test_cases[i]; + + /* Do any test case events overflow MAX_TEST_EVENTS? 
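+ * (intel_event_end and amd_event_end sit directly after the fixed-size
+ * arrays as guard fields, so a non-zero value means an initializer
+ * spilled past MAX_TEST_EVENTS.)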
*/ + assert(test->intel_event_end == 0); + assert(test->amd_event_end == 0); + + n = append_test_events(test, events, nevents); + + run_masked_events_test(vcpu, events, n); + + TEST_ASSERT(bool_eq(pmc_results.loads, test->flags & ALLOW_LOADS) && + bool_eq(pmc_results.stores, test->flags & ALLOW_STORES) && + bool_eq(pmc_results.loads_stores, + test->flags & ALLOW_LOADS_STORES), + "%s loads: %lu, stores: %lu, loads + stores: %lu", + test->msg, pmc_results.loads, pmc_results.stores, + pmc_results.loads_stores); + } +} + +static void add_dummy_events(uint64_t *events, int nevents) +{ + int i; + + for (i = 0; i < nevents; i++) { + int event_select = i % 0xFF; + bool exclude = ((i % 4) == 0); + + if (event_select == MEM_INST_RETIRED || + event_select == LS_DISPATCH) + event_select++; + + events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0, + 0, exclude); + } +} + +static void test_masked_events(struct kvm_vcpu *vcpu) +{ + int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS; + uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS]; + + /* Run the test cases against a sparse PMU event filter. */ + run_masked_events_tests(vcpu, events, 0); + + /* Run the test cases against a dense PMU event filter. */ + add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS); + run_masked_events_tests(vcpu, events, nevents); +} + +static int set_pmu_event_filter(struct kvm_vcpu *vcpu, + struct __kvm_pmu_event_filter *__f) +{ + struct kvm_pmu_event_filter *f = (void *)__f; + + return __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); +} + +static int set_pmu_single_event_filter(struct kvm_vcpu *vcpu, uint64_t event, + uint32_t flags, uint32_t action) +{ + struct __kvm_pmu_event_filter f = { + .nevents = 1, + .flags = flags, + .action = action, + .events = { + event, + }, + }; + + return set_pmu_event_filter(vcpu, &f); +} + +static void test_filter_ioctl(struct kvm_vcpu *vcpu) +{ + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + struct __kvm_pmu_event_filter f; + uint64_t e = ~0ul; + int r; + + /* + * Unfortunately having invalid bits set in event data is expected to + * pass when flags == 0 (bits other than eventsel+umask). 
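+ *
+ * Hence the first filter below uses e = ~0ul: with flags == 0 KVM only
+ * consumes the event select and unit mask bits and accepts the filter,
+ * while the same value with KVM_PMU_EVENT_FLAG_MASKED_EVENTS is rejected
+ * because the remaining bits are then validated.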
+ */ + r = set_pmu_single_event_filter(vcpu, e, 0, KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); + + r = set_pmu_single_event_filter(vcpu, e, + KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail"); + + e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf); + r = set_pmu_single_event_filter(vcpu, e, + KVM_PMU_EVENT_FLAG_MASKED_EVENTS, + KVM_PMU_EVENT_ALLOW); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); + + f = base_event_filter; + f.action = PMU_EVENT_FILTER_INVALID_ACTION; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Set invalid action is expected to fail"); + + f = base_event_filter; + f.flags = PMU_EVENT_FILTER_INVALID_FLAGS; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Set invalid flags is expected to fail"); + + f = base_event_filter; + f.nevents = PMU_EVENT_FILTER_INVALID_NEVENTS; + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(r, "Exceeding the max number of filter events should fail"); + + f = base_event_filter; + f.fixed_counter_bitmap = ~GENMASK_ULL(nr_fixed_counters, 0); + r = set_pmu_event_filter(vcpu, &f); + TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed"); +} + +static void intel_run_fixed_counter_guest_code(uint8_t idx) +{ + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0); + + /* Only OS_EN bit is enabled for fixed counter[idx]. */ + wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL)); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx)); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx)); + } +} + +static uint64_t test_with_fixed_counter_filter(struct kvm_vcpu *vcpu, + uint32_t action, uint32_t bitmap) +{ + struct __kvm_pmu_event_filter f = { + .action = action, + .fixed_counter_bitmap = bitmap, + }; + set_pmu_event_filter(vcpu, &f); + + return run_vcpu_to_sync(vcpu); +} + +static uint64_t test_set_gp_and_fixed_event_filter(struct kvm_vcpu *vcpu, + uint32_t action, + uint32_t bitmap) +{ + struct __kvm_pmu_event_filter f = base_event_filter; + + f.action = action; + f.fixed_counter_bitmap = bitmap; + set_pmu_event_filter(vcpu, &f); + + return run_vcpu_to_sync(vcpu); +} + +static void __test_fixed_counter_bitmap(struct kvm_vcpu *vcpu, uint8_t idx, + uint8_t nr_fixed_counters) +{ + unsigned int i; + uint32_t bitmap; + uint64_t count; + + TEST_ASSERT(nr_fixed_counters < sizeof(bitmap) * 8, + "Invalid nr_fixed_counters"); + + /* + * Check the fixed performance counter can count normally when KVM + * userspace doesn't set any pmu filter. + */ + count = run_vcpu_to_sync(vcpu); + TEST_ASSERT(count, "Unexpected count value: %ld", count); + + for (i = 0; i < BIT(nr_fixed_counters); i++) { + bitmap = BIT(i); + count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_ALLOW, + bitmap); + TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); + + count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_DENY, + bitmap); + TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); + + /* + * Check that fixed_counter_bitmap has higher priority than + * events[] when both are set. 
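+ *
+ * E.g. for idx == 1 with bitmap == BIT(0), an ALLOW filter must yield a
+ * zero count even though events[] on its own would permit counting,
+ * because the fixed counter bitmap does not include counter 1.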
+ */ + count = test_set_gp_and_fixed_event_filter(vcpu, + KVM_PMU_EVENT_ALLOW, + bitmap); + TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx))); + + count = test_set_gp_and_fixed_event_filter(vcpu, + KVM_PMU_EVENT_DENY, + bitmap); + TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx))); + } +} + +static void test_fixed_counter_bitmap(void) +{ + uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint8_t idx; + + /* + * Check that pmu_event_filter works as expected when it's applied to + * fixed performance counters. + */ + for (idx = 0; idx < nr_fixed_counters; idx++) { + vm = vm_create_with_one_vcpu(&vcpu, + intel_run_fixed_counter_guest_code); + vcpu_args_set(vcpu, 1, idx); + __test_fixed_counter_bitmap(vcpu, idx, nr_fixed_counters); + kvm_vm_free(vm); + } +} + +int main(int argc, char *argv[]) +{ + void (*guest_code)(void); + struct kvm_vcpu *vcpu, *vcpu2 = NULL; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_is_pmu_enabled()); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS)); + + TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); + guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + TEST_REQUIRE(sanity_check_pmu(vcpu)); + + if (use_amd_pmu()) + test_amd_deny_list(vcpu); + + test_without_filter(vcpu); + test_member_deny_list(vcpu); + test_member_allow_list(vcpu); + test_not_member_deny_list(vcpu); + test_not_member_allow_list(vcpu); + + if (use_intel_pmu() && + supports_event_mem_inst_retired() && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3) + vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code); + else if (use_amd_pmu()) + vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code); + + if (vcpu2) + test_masked_events(vcpu2); + test_filter_ioctl(vcpu); + + kvm_vm_free(vm); + + test_pmu_config_disable(guest_code); + test_fixed_counter_bitmap(); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c new file mode 100644 index 000000000000..82a8d88b5338 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. 
+ */ +#include <fcntl.h> +#include <limits.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/kvm_para.h> +#include <linux/memfd.h> +#include <linux/sizes.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define BASE_DATA_SLOT 10 +#define BASE_DATA_GPA ((uint64_t)(1ull << 32)) +#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE)) + +/* Horrific macro so that the line info is captured accurately :-( */ +#define memcmp_g(gpa, pattern, size) \ +do { \ + uint8_t *mem = (uint8_t *)gpa; \ + size_t i; \ + \ + for (i = 0; i < size; i++) \ + __GUEST_ASSERT(mem[i] == pattern, \ + "Guest expected 0x%x at offset %lu (gpa 0x%lx), got 0x%x", \ + pattern, i, gpa + i, mem[i]); \ +} while (0) + +static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + TEST_ASSERT(mem[i] == pattern, + "Host expected 0x%x at gpa 0x%lx, got 0x%x", + pattern, gpa + i, mem[i]); +} + +/* + * Run memory conversion tests with explicit conversion: + * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit + * to back/unback private memory. Subsequent accesses by guest to the gpa range + * will not cause exit to userspace. + * + * Test memory conversion scenarios with following steps: + * 1) Access private memory using private access and verify that memory contents + * are not visible to userspace. + * 2) Convert memory to shared using explicit conversions and ensure that + * userspace is able to access the shared regions. + * 3) Convert memory back to private using explicit conversions and ensure that + * userspace is again not able to access converted private regions. + */ + +#define GUEST_STAGE(o, s) { .offset = o, .size = s } + +enum ucall_syncs { + SYNC_SHARED, + SYNC_PRIVATE, +}; + +static void guest_sync_shared(uint64_t gpa, uint64_t size, + uint8_t current_pattern, uint8_t new_pattern) +{ + GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern); +} + +static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern) +{ + GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern); +} + +/* Arbitrary values, KVM doesn't care about the attribute flags. */ +#define MAP_GPA_SET_ATTRIBUTES BIT(0) +#define MAP_GPA_SHARED BIT(1) +#define MAP_GPA_DO_FALLOCATE BIT(2) + +static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared, + bool do_fallocate) +{ + uint64_t flags = MAP_GPA_SET_ATTRIBUTES; + + if (map_shared) + flags |= MAP_GPA_SHARED; + if (do_fallocate) + flags |= MAP_GPA_DO_FALLOCATE; + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, true, do_fallocate); +} + +static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, false, do_fallocate); +} + +struct { + uint64_t offset; + uint64_t size; +} static const test_ranges[] = { + GUEST_STAGE(0, PAGE_SIZE), + GUEST_STAGE(0, SZ_2M), + GUEST_STAGE(PAGE_SIZE, PAGE_SIZE), + GUEST_STAGE(PAGE_SIZE, SZ_2M), + GUEST_STAGE(SZ_2M, PAGE_SIZE), +}; + +static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate) +{ + const uint8_t def_p = 0xaa; + const uint8_t init_p = 0xcc; + uint64_t j; + int i; + + /* Memory should be shared by default. 
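+ * The handshake below relies on that: the guest fills the region with
+ * def_p, guest_sync_shared() has the host verify def_p and overwrite it
+ * with init_p, and the guest then re-checks init_p via memcmp_g().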
*/ + memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE); + guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p); + + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + uint8_t p1 = 0x11; + uint8_t p2 = 0x22; + uint8_t p3 = 0x33; + uint8_t p4 = 0x44; + + /* + * Set the test region to pattern one to differentiate it from + * the data range as a whole (contains the initial pattern). + */ + memset((void *)gpa, p1, size); + + /* + * Convert to private, set and verify the private data, and + * then verify that the rest of the data (map shared) still + * holds the initial pattern, and that the host always sees the + * shared memory (initial pattern). Unlike shared memory, + * punching a hole in private memory is destructive, i.e. + * previous values aren't guaranteed to be preserved. + */ + guest_map_private(gpa, size, do_fallocate); + + if (size > PAGE_SIZE) { + memset((void *)gpa, p2, PAGE_SIZE); + goto skip; + } + + memset((void *)gpa, p2, size); + guest_sync_private(gpa, size, p1); + + /* + * Verify that the private memory was set to pattern two, and + * that shared memory still holds the initial pattern. + */ + memcmp_g(gpa, p2, size); + if (gpa > base_gpa) + memcmp_g(base_gpa, init_p, gpa - base_gpa); + if (gpa + size < base_gpa + PER_CPU_DATA_SIZE) + memcmp_g(gpa + size, init_p, + (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size)); + + /* + * Convert odd-number page frames back to shared to verify KVM + * also correctly handles holes in private ranges. + */ + for (j = 0; j < size; j += PAGE_SIZE) { + if ((j >> PAGE_SHIFT) & 1) { + guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate); + guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3); + + memcmp_g(gpa + j, p3, PAGE_SIZE); + } else { + guest_sync_private(gpa + j, PAGE_SIZE, p1); + } + } + +skip: + /* + * Convert the entire region back to shared, explicitly write + * pattern three to fill in the even-number frames before + * asking the host to verify (and write pattern four). + */ + guest_map_shared(gpa, size, do_fallocate); + memset((void *)gpa, p3, size); + guest_sync_shared(gpa, size, p3, p4); + memcmp_g(gpa, p4, size); + + /* Reset the shared memory back to the initial pattern. */ + memset((void *)gpa, init_p, size); + + /* + * Free (via PUNCH_HOLE) *all* private memory so that the next + * iteration starts from a clean slate, e.g. with respect to + * whether or not there are pages/folios in guest_mem. + */ + guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true); + } +} + +static void guest_punch_hole(uint64_t gpa, uint64_t size) +{ + /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */ + uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE; + + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +/* + * Test that PUNCH_HOLE actually frees memory by punching holes without doing a + * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating + * (subsequent fault) should zero memory. + */ +static void guest_test_punch_hole(uint64_t base_gpa, bool precise) +{ + const uint8_t init_p = 0xcc; + int i; + + /* + * Convert the entire range to private, this testcase is all about + * punching holes in guest_memfd, i.e. shared mappings aren't needed. 
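+ *
+ * Note that a punch-hole request is issued as MAP_GPA_SHARED |
+ * MAP_GPA_DO_FALLOCATE without MAP_GPA_SET_ATTRIBUTES (see
+ * guest_punch_hole()), so only the backing store is freed and the
+ * private attribute stays set.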
+ */ + guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + + /* + * Free all memory before each iteration, even for the !precise + * case where the memory will be faulted back in. Freeing and + * reallocating should obviously work, and freeing all memory + * minimizes the probability of cross-testcase influence. + */ + guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE); + + /* Fault-in and initialize memory, and verify the pattern. */ + if (precise) { + memset((void *)gpa, init_p, size); + memcmp_g(gpa, init_p, size); + } else { + memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + } + + /* + * Punch a hole at the target range and verify that reads from + * the guest succeed and return zeroes. + */ + guest_punch_hole(gpa, size); + memcmp_g(gpa, 0, size); + } +} + +static void guest_code(uint64_t base_gpa) +{ + /* + * Run the conversion test twice, with and without doing fallocate() on + * the guest_memfd backing when converting between shared and private. + */ + guest_test_explicit_conversion(base_gpa, false); + guest_test_explicit_conversion(base_gpa, true); + + /* + * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd + * faulted in, once with only the target range faulted in. + */ + guest_test_punch_hole(base_gpa, false); + guest_test_punch_hole(base_gpa, true); + GUEST_DONE(); +} + +static void handle_exit_hypercall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + uint64_t gpa = run->hypercall.args[0]; + uint64_t size = run->hypercall.args[1] * PAGE_SIZE; + bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES; + bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED; + bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE; + struct kvm_vm *vm = vcpu->vm; + + TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE, + "Wanted MAP_GPA_RANGE (%u), got '%llu'", + KVM_HC_MAP_GPA_RANGE, run->hypercall.nr); + + if (do_fallocate) + vm_guest_mem_fallocate(vm, gpa, size, map_shared); + + if (set_attributes) + vm_set_memory_attributes(vm, gpa, size, + map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE); + run->hypercall.ret = 0; +} + +static bool run_vcpus; + +static void *__test_mem_conversions(void *__vcpu) +{ + struct kvm_vcpu *vcpu = __vcpu; + struct kvm_run *run = vcpu->run; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + + while (!READ_ONCE(run_vcpus)) + ; + + for ( ;; ) { + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_HYPERCALL) { + handle_exit_hypercall(vcpu); + continue; + } + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: { + uint64_t gpa = uc.args[1]; + size_t size = uc.args[2]; + size_t i; + + TEST_ASSERT(uc.args[0] == SYNC_SHARED || + uc.args[0] == SYNC_PRIVATE, + "Unknown sync command '%ld'", uc.args[0]); + + for (i = 0; i < size; i += vm->page_size) { + size_t nr_bytes = min_t(size_t, vm->page_size, size - i); + uint8_t *hva = addr_gpa2hva(vm, gpa + i); + + /* In all cases, the host should observe the shared data. */ + memcmp_h(hva, gpa + i, uc.args[3], nr_bytes); + + /* For shared, write the new pattern to guest memory. 
*/ + if (uc.args[0] == SYNC_SHARED) + memset(hva, uc.args[4], nr_bytes); + } + break; + } + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +} + +static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, + uint32_t nr_memslots) +{ + /* + * Allocate enough memory so that each vCPU's chunk of memory can be + * naturally aligned with respect to the size of the backing store. + */ + const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type)); + const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment); + const size_t memfd_size = per_cpu_size * nr_vcpus; + const size_t slot_size = memfd_size / nr_memslots; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + pthread_t threads[KVM_MAX_VCPUS]; + struct kvm_vm *vm; + int memfd, i, r; + + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + TEST_ASSERT(slot_size * nr_memslots == memfd_size, + "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)", + memfd_size, nr_memslots); + vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus); + + vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); + + memfd = vm_create_guest_memfd(vm, memfd_size, 0); + + for (i = 0; i < nr_memslots; i++) + vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, + BASE_DATA_SLOT + i, slot_size / vm->page_size, + KVM_MEM_GUEST_MEMFD, memfd, slot_size * i); + + for (i = 0; i < nr_vcpus; i++) { + uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size; + + vcpu_args_set(vcpus[i], 1, gpa); + + /* + * Map only what is needed so that an out-of-bounds access + * results #PF => SHUTDOWN instead of data corruption. + */ + virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size); + + pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]); + } + + WRITE_ONCE(run_vcpus, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + kvm_vm_free(vm); + + /* + * Allocate and free memory from the guest_memfd after closing the VM + * fd. The guest_memfd is gifted a reference to its owning VM, i.e. + * should prevent the VM from being fully destroyed until the last + * reference to the guest_memfd is also put. 
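 * (Aside, not part of the patch: of the two fallocate() calls that follow,
 * the first, with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, frees the
 * backing pages while leaving the file size untouched, and the second, with
 * FALLOC_FL_KEEP_SIZE alone, reallocates them, so both paths are exercised
 * after kvm_vm_free().)
 *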
+ */ + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + close(memfd); +} + +static void usage(const char *cmd) +{ + puts(""); + printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd); + puts(""); + backing_src_help("-s"); + puts(""); + puts(" -n: specify the number of vcpus (default: 1)"); + puts(""); + puts(" -m: specify the number of memslots (default: 1)"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC; + uint32_t nr_memslots = 1; + uint32_t nr_vcpus = 1; + int opt; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) { + switch (opt) { + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'n': + nr_vcpus = atoi_positive("nr_vcpus", optarg); + break; + case 'm': + nr_memslots = atoi_positive("nr_memslots", optarg); + break; + case 'h': + default: + usage(argv[0]); + exit(0); + } + } + + test_mem_conversions(src_type, nr_vcpus, nr_memslots); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c new file mode 100644 index 000000000000..13e72fcec8dd --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Google LLC. + */ +#include <linux/kvm.h> +#include <pthread.h> +#include <stdint.h> + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +/* Arbitrarily selected to avoid overlaps with anything else */ +#define EXITS_TEST_GVA 0xc0000000 +#define EXITS_TEST_GPA EXITS_TEST_GVA +#define EXITS_TEST_NPAGES 1 +#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE) +#define EXITS_TEST_SLOT 10 + +static uint64_t guest_repeatedly_read(void) +{ + volatile uint64_t value; + + while (true) + value = *((uint64_t *) EXITS_TEST_GVA); + + return value; +} + +static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu) +{ + int r; + + r = _vcpu_run(vcpu); + if (r) { + TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r)); + TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT); + } + return vcpu->run->exit_reason; +} + +const struct vm_shape protected_vm_shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, +}; + +static void test_private_access_memslot_deleted(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + pthread_t vm_thread; + void *thread_return; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, + KVM_MEM_GUEST_MEMFD); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + pthread_create(&vm_thread, NULL, + (void *(*)(void *))run_vcpu_get_exit_reason, + (void *)vcpu); + + vm_mem_region_delete(vm, EXITS_TEST_SLOT); + + pthread_join(vm_thread, &thread_return); + exit_reason = (uint32_t)(uint64_t)thread_return; + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, 
KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +static void test_private_access_memslot_not_private(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + /* Add a non-private memslot (flags = 0) */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, 0); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + exit_reason = run_vcpu_get_exit_reason(vcpu); + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + test_private_access_memslot_deleted(); + test_private_access_memslot_not_private(); +} diff --git a/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c new file mode 100644 index 000000000000..cbc92a862ea9 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test edge cases and race conditions in kvm_recalculate_apic_map(). + */ + +#include <sys/ioctl.h> +#include <pthread.h> +#include <time.h> + +#include "processor.h" +#include "test_util.h" +#include "kvm_util.h" +#include "apic.h" + +#define TIMEOUT 5 /* seconds */ + +#define LAPIC_DISABLED 0 +#define LAPIC_X2APIC (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) +#define MAX_XAPIC_ID 0xff + +static void *race(void *arg) +{ + struct kvm_lapic_state lapic = {}; + struct kvm_vcpu *vcpu = arg; + + while (1) { + /* Trigger kvm_recalculate_apic_map(). */ + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic); + pthread_testcancel(); + } + + return NULL; +} + +int main(void) +{ + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct kvm_vcpu *vcpuN; + struct kvm_vm *vm; + pthread_t thread; + time_t t; + int i; + + kvm_static_assert(KVM_MAX_VCPUS > MAX_XAPIC_ID); + + /* + * Create the max number of vCPUs supported by selftests so that KVM + * has decent amount of work to do when recalculating the map, i.e. to + * make the problematic window large enough to hit. + */ + vm = vm_create_with_vcpus(KVM_MAX_VCPUS, NULL, vcpus); + + /* + * Enable x2APIC on all vCPUs so that KVM doesn't bail from the recalc + * due to vCPUs having aliased xAPIC IDs (truncated to 8 bits). 
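 * (Aside, not part of the patch: LAPIC_X2APIC below is
 * MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, i.e. the xAPIC global-enable bit
 * (bit 11) together with the x2APIC mode bit (bit 10) of IA32_APICBASE; both
 * must be set to switch a vCPU into x2APIC mode.)
 *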
+ */ + for (i = 0; i < KVM_MAX_VCPUS; i++) + vcpu_set_msr(vcpus[i], MSR_IA32_APICBASE, LAPIC_X2APIC); + + TEST_ASSERT_EQ(pthread_create(&thread, NULL, race, vcpus[0]), 0); + + vcpuN = vcpus[KVM_MAX_VCPUS - 1]; + for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { + vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_X2APIC); + vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_DISABLED); + } + + TEST_ASSERT_EQ(pthread_cancel(thread), 0); + TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c new file mode 100644 index 000000000000..d691d86e5bc3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_SET_BOOT_CPU_ID works as intended + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "apic.h" + +static void guest_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT_NE(get_bsp_flag(), 0); + + GUEST_DONE(); +} + +static void guest_not_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT_EQ(get_bsp_flag(), 0); + + GUEST_DONE(); +} + +static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg) +{ + int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID, + (void *)(unsigned long)vcpu->id); + + TEST_ASSERT(r == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set %s", msg); +} + +static void run_vcpu(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + int stage; + + for (stage = 0; stage < 2; stage++) { + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + test_set_bsp_busy(vcpu, "while running vm"); + break; + case UCALL_DONE: + TEST_ASSERT(stage == 1, + "Expected GUEST_DONE in stage 2, got stage %d", + stage); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } + } +} + +static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id, + struct kvm_vcpu *vcpus[]) +{ + struct kvm_vm *vm; + uint32_t i; + + vm = vm_create(nr_vcpus); + + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id); + + for (i = 0; i < nr_vcpus; i++) + vcpus[i] = vm_vcpu_add(vm, i, i == bsp_vcpu_id ? 
guest_bsp_vcpu : + guest_not_bsp_vcpu); + return vm; +} + +static void run_vm_bsp(uint32_t bsp_vcpu_id) +{ + struct kvm_vcpu *vcpus[2]; + struct kvm_vm *vm; + + vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus); + + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); + + kvm_vm_free(vm); +} + +static void check_set_bsp_busy(void) +{ + struct kvm_vcpu *vcpus[2]; + struct kvm_vm *vm; + + vm = create_vm(ARRAY_SIZE(vcpus), 0, vcpus); + + test_set_bsp_busy(vcpus[1], "after adding vcpu"); + + run_vcpu(vcpus[0]); + run_vcpu(vcpus[1]); + + test_set_bsp_busy(vcpus[1], "to a terminated vcpu"); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID)); + + run_vm_bsp(0); + run_vm_bsp(1); + run_vm_bsp(0); + + check_set_bsp_busy(); +} diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c index 9f7656184f31..c021c0795a96 100644 --- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c +++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c @@ -10,7 +10,6 @@ * That bug allowed a user-mode program that called the KVM_SET_SREGS * ioctl to put a VCPU's local APIC into an invalid state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -22,27 +21,117 @@ #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 5 +#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \ +do { \ + struct kvm_sregs new; \ + int rc; \ + \ + /* Skip the sub-test, the feature/bit is supported. */ \ + if (orig.cr & bit) \ + break; \ + \ + memcpy(&new, &orig, sizeof(sregs)); \ + new.cr |= bit; \ + \ + rc = _vcpu_sregs_set(vcpu, &new); \ + TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \ + \ + /* Sanity check that KVM didn't change anything. */ \ + vcpu_sregs_get(vcpu, &new); \ + TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \ +} while (0) + +static uint64_t calc_supported_cr4_feature_bits(void) +{ + uint64_t cr4; + + cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE | + X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE | + X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT; + if (kvm_cpu_has(X86_FEATURE_UMIP)) + cr4 |= X86_CR4_UMIP; + if (kvm_cpu_has(X86_FEATURE_LA57)) + cr4 |= X86_CR4_LA57; + if (kvm_cpu_has(X86_FEATURE_VMX)) + cr4 |= X86_CR4_VMXE; + if (kvm_cpu_has(X86_FEATURE_SMX)) + cr4 |= X86_CR4_SMXE; + if (kvm_cpu_has(X86_FEATURE_FSGSBASE)) + cr4 |= X86_CR4_FSGSBASE; + if (kvm_cpu_has(X86_FEATURE_PCID)) + cr4 |= X86_CR4_PCIDE; + if (kvm_cpu_has(X86_FEATURE_XSAVE)) + cr4 |= X86_CR4_OSXSAVE; + if (kvm_cpu_has(X86_FEATURE_SMEP)) + cr4 |= X86_CR4_SMEP; + if (kvm_cpu_has(X86_FEATURE_SMAP)) + cr4 |= X86_CR4_SMAP; + if (kvm_cpu_has(X86_FEATURE_PKU)) + cr4 |= X86_CR4_PKE; + + return cr4; +} int main(int argc, char *argv[]) { struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - int rc; + uint64_t cr4; + int rc, i; + + /* + * Create a dummy VM, specifically to avoid doing KVM_SET_CPUID2, and + * use it to verify all supported CR4 bits can be set prior to defining + * the vCPU model, i.e. without doing KVM_SET_CPUID2. 
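 * (Aside, not part of the patch: per the selftest naming convention, the
 * underscore-prefixed _vcpu_sregs_set() returns the raw ioctl result instead
 * of asserting success, which is what lets TEST_INVALID_CR_BIT() above and
 * the checks below verify that KVM rejects the bogus values.)
 *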
+ */ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + + vcpu_sregs_get(vcpu, &sregs); - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); + sregs.cr0 = 0; + sregs.cr4 |= calc_supported_cr4_feature_bits(); + cr4 = sregs.cr4; + + rc = _vcpu_sregs_set(vcpu, &sregs); + TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4); + + vcpu_sregs_get(vcpu, &sregs); + TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)", + sregs.cr4, cr4); + + /* Verify all unsupported features are rejected by KVM. */ + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP); + TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE); + + for (i = 32; i < 64; i++) + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i)); + + /* NW without CD is illegal, as is PG without PE. */ + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW); + TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG); + + kvm_vm_free(vm); - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, NULL); + /* Create a "real" VM and verify APIC_BASE can be set. */ + vm = vm_create_with_one_vcpu(&vcpu, NULL); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.apic_base = 1 << 10; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)", sregs.apic_base); sregs.apic_base = 1 << 11; - rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs); + rc = _vcpu_sregs_set(vcpu, &sregs); TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)", sregs.apic_base); diff --git a/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c new file mode 100644 index 000000000000..7a4a61be119b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_init2_tests.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm.h> +#include <linux/psp-sev.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" + +#define SVM_SEV_FEAT_DEBUG_SWAP 32u + +/* + * Some features may have hidden dependencies, or may only work + * for certain VM types. Err on the side of safety and don't + * expect that all supported features can be passed one by one + * to KVM_SEV_INIT2. + * + * (Well, right now there's only one...) 
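 * (Illustrative aside, not part of the patch: enabling a known feature thus
 * boils down to something along the lines of
 *
 *	struct kvm_sev_init init = { .vmsa_features = SVM_SEV_FEAT_DEBUG_SWAP };
 *
 *	test_init2(KVM_X86_SEV_ES_VM, &init);
 *
 * which is what test_features() below does bit by bit for each supported
 * feature.)
 *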
+ */ +#define KNOWN_FEATURES SVM_SEV_FEAT_DEBUG_SWAP + +int kvm_fd; +u64 supported_vmsa_features; +bool have_sev_es; + +static int __sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT(ret < 0 || cmd.error == SEV_RET_SUCCESS, + "%d failed: fw error: %d\n", + cmd_id, cmd.error); + + return ret; +} + +static void test_init2(unsigned long vm_type, struct kvm_sev_init *init) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == 0, + "KVM_SEV_INIT2 return code is %d (expected 0), errno: %d", + ret, errno); + kvm_vm_free(vm); +} + +static void test_init2_invalid(unsigned long vm_type, struct kvm_sev_init *init, const char *msg) +{ + struct kvm_vm *vm; + int ret; + + vm = vm_create_barebones_type(vm_type); + ret = __sev_ioctl(vm->fd, KVM_SEV_INIT2, init); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "KVM_SEV_INIT2 should fail, %s.", + msg); + kvm_vm_free(vm); +} + +void test_vm_types(void) +{ + test_init2(KVM_X86_SEV_VM, &(struct kvm_sev_init){}); + + /* + * TODO: check that unsupported types cannot be created. Probably + * a separate selftest. + */ + if (have_sev_es) + test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + + test_init2_invalid(0, &(struct kvm_sev_init){}, + "VM type is KVM_X86_DEFAULT_VM"); + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + test_init2_invalid(KVM_X86_SW_PROTECTED_VM, &(struct kvm_sev_init){}, + "VM type is KVM_X86_SW_PROTECTED_VM"); +} + +void test_flags(uint32_t vm_type) +{ + int i; + + for (i = 0; i < 32; i++) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .flags = BIT(i) }, + "invalid flag"); +} + +void test_features(uint32_t vm_type, uint64_t supported_features) +{ + int i; + + for (i = 0; i < 64; i++) { + if (!(supported_features & (1u << i))) + test_init2_invalid(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }, + "unknown feature"); + else if (KNOWN_FEATURES & (1u << i)) + test_init2(vm_type, + &(struct kvm_sev_init){ .vmsa_features = BIT_ULL(i) }); + } +} + +int main(int argc, char *argv[]) +{ + int kvm_fd = open_kvm_dev_path_or_exit(); + bool have_sev; + + TEST_REQUIRE(__kvm_has_device_attr(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES) == 0); + kvm_device_attr_get(kvm_fd, KVM_X86_GRP_SEV, + KVM_X86_SEV_VMSA_FEATURES, + &supported_vmsa_features); + + have_sev = kvm_cpu_has(X86_FEATURE_SEV); + TEST_ASSERT(have_sev == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)), + "sev: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_VM); + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_VM)); + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + TEST_ASSERT(have_sev_es == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SEV_ES_VM)), + "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", + kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + + test_vm_types(); + + test_flags(KVM_X86_SEV_VM); + if (have_sev_es) + test_flags(KVM_X86_SEV_ES_VM); + + test_features(KVM_X86_SEV_VM, 0); + if (have_sev_es) + test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c new file mode 100644 
index 000000000000..0a6dfba3905b --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kvm.h> +#include <linux/psp-sev.h> +#include <stdio.h> +#include <sys/ioctl.h> +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "sev.h" +#include "kselftest.h" + +#define NR_MIGRATE_TEST_VCPUS 4 +#define NR_MIGRATE_TEST_VMS 3 +#define NR_LOCK_TESTING_THREADS 3 +#define NR_LOCK_TESTING_ITERATIONS 10000 + +bool have_sev_es; + +static struct kvm_vm *sev_vm_create(bool es) +{ + struct kvm_vm *vm; + int i; + + vm = vm_create_barebones(); + if (!es) + sev_vm_init(vm); + else + sev_es_vm_init(vm); + + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(vm, i); + + sev_vm_launch(vm, es ? SEV_POLICY_ES : 0); + + if (es) + vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + return vm; +} + +static struct kvm_vm *aux_vm_create(bool with_vcpus) +{ + struct kvm_vm *vm; + int i; + + vm = vm_create_barebones(); + if (!with_vcpus) + return vm; + + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(vm, i); + + return vm; +} + +static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) +{ + return __vm_enable_cap(dst, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src->fd); +} + + +static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src) +{ + int ret; + + ret = __sev_migrate_from(dst, src); + TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno); +} + +static void test_sev_migrate_from(bool es) +{ + struct kvm_vm *src_vm; + struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS]; + int i, ret; + + src_vm = sev_vm_create(es); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + dst_vms[i] = aux_vm_create(true); + + /* Initial migration from the src to the first dst. */ + sev_migrate_from(dst_vms[0], src_vm); + + for (i = 1; i < NR_MIGRATE_TEST_VMS; i++) + sev_migrate_from(dst_vms[i], dst_vms[i - 1]); + + /* Migrate the guest back to the original VM. */ + ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]); + TEST_ASSERT(ret == -1 && errno == EIO, + "VM that was migrated from should be dead. 
ret %d, errno: %d", ret, + errno); + + kvm_vm_free(src_vm); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + kvm_vm_free(dst_vms[i]); +} + +struct locking_thread_input { + struct kvm_vm *vm; + struct kvm_vm *source_vms[NR_LOCK_TESTING_THREADS]; +}; + +static void *locking_test_thread(void *arg) +{ + int i, j; + struct locking_thread_input *input = (struct locking_thread_input *)arg; + + for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) { + j = i % NR_LOCK_TESTING_THREADS; + __sev_migrate_from(input->vm, input->source_vms[j]); + } + + return NULL; +} + +static void test_sev_migrate_locking(void) +{ + struct locking_thread_input input[NR_LOCK_TESTING_THREADS]; + pthread_t pt[NR_LOCK_TESTING_THREADS]; + int i; + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) { + input[i].vm = sev_vm_create(/* es= */ false); + input[0].source_vms[i] = input[i].vm; + } + for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i) + memcpy(input[i].source_vms, input[0].source_vms, + sizeof(input[i].source_vms)); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_create(&pt[i], NULL, locking_test_thread, &input[i]); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_join(pt[i], NULL); + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + kvm_vm_free(input[i].vm); +} + +static void test_sev_migrate_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_no_sev, + *sev_es_vm_no_vmsa; + int ret; + + vm_no_vcpu = vm_create_barebones(); + vm_no_sev = aux_vm_create(true); + ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Migrations require SEV enabled. ret %d, errno: %d", ret, + errno); + + if (!have_sev_es) + goto out; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); + sev_es_vm_no_vmsa = vm_create_barebones(); + sev_es_vm_init(sev_es_vm_no_vmsa); + __vm_vcpu_add(sev_es_vm_no_vmsa, 1); + + ret = __sev_migrate_from(sev_vm, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV enabled VM. ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(sev_es_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d", + ret, errno); + + kvm_vm_free(sev_vm); + kvm_vm_free(sev_es_vm); + kvm_vm_free(sev_es_vm_no_vmsa); +out: + kvm_vm_free(vm_no_vcpu); + kvm_vm_free(vm_no_sev); +} + +static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) +{ + return __vm_enable_cap(dst, KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, src->fd); +} + + +static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src) +{ + int ret; + + ret = __sev_mirror_create(dst, src); + TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno); +} + +static void verify_mirror_allowed_cmds(struct kvm_vm *vm) +{ + struct kvm_sev_guest_status status; + int cmd_id; + + for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) { + int ret; + + /* + * These commands are allowed for mirror VMs, all others are + * not. 
+ */ + switch (cmd_id) { + case KVM_SEV_LAUNCH_UPDATE_VMSA: + case KVM_SEV_GUEST_STATUS: + case KVM_SEV_DBG_DECRYPT: + case KVM_SEV_DBG_ENCRYPT: + continue; + default: + break; + } + + /* + * These commands should be disallowed before the data + * parameter is examined so NULL is OK here. + */ + ret = __vm_sev_ioctl(vm, cmd_id, NULL); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able call command: %d. ret: %d, errno: %d", + cmd_id, ret, errno); + } + + vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status); +} + +static void test_sev_mirror(bool es) +{ + struct kvm_vm *src_vm, *dst_vm; + int i; + + src_vm = sev_vm_create(es); + dst_vm = aux_vm_create(false); + + sev_mirror_create(dst_vm, src_vm); + + /* Check that we can complete creation of the mirror VM. */ + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + __vm_vcpu_add(dst_vm, i); + + if (es) + vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + + verify_mirror_allowed_cmds(dst_vm); + + kvm_vm_free(src_vm); + kvm_vm_free(dst_vm); +} + +static void test_sev_mirror_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + vm_with_vcpu = aux_vm_create(true); + vm_no_vcpu = aux_vm_create(false); + + ret = __sev_mirror_create(sev_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to self. ret: %d, errno: %d", + ret, errno); + + ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Copy context requires SEV enabled. ret %d, errno: %d", ret, + errno); + + ret = __sev_mirror_create(vm_with_vcpu, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d", + ret, errno); + + if (!have_sev_es) + goto out; + + sev_es_vm = sev_vm_create(/* es= */ true); + ret = __sev_mirror_create(sev_vm, sev_es_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d", + ret, errno); + + ret = __sev_mirror_create(sev_es_vm, sev_vm); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d", + ret, errno); + + kvm_vm_free(sev_es_vm); + +out: + kvm_vm_free(sev_vm); + kvm_vm_free(vm_with_vcpu); + kvm_vm_free(vm_no_vcpu); +} + +static void test_sev_move_copy(void) +{ + struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm, + *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm; + + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + dst2_vm = aux_vm_create(true); + dst3_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + dst2_mirror_vm = aux_vm_create(false); + dst3_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm, sev_vm); + + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); + + sev_migrate_from(dst2_vm, dst_vm); + sev_migrate_from(dst2_mirror_vm, dst_mirror_vm); + + sev_migrate_from(dst3_mirror_vm, dst2_mirror_vm); + sev_migrate_from(dst3_vm, dst2_vm); + + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); + kvm_vm_free(dst2_vm); + kvm_vm_free(dst3_vm); + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst2_mirror_vm); + kvm_vm_free(dst3_mirror_vm); + + /* + * Run similar test be destroy mirrors before mirrored VMs to ensure + * destruction is done safely. 
+ */ + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm, sev_vm); + + sev_migrate_from(dst_mirror_vm, mirror_vm); + sev_migrate_from(dst_vm, sev_vm); + + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_mirror_vm); + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES); + + if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { + test_sev_migrate_from(/* es= */ false); + if (have_sev_es) + test_sev_migrate_from(/* es= */ true); + test_sev_migrate_locking(); + test_sev_migrate_parameters(); + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) + test_sev_move_copy(); + } + if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + test_sev_mirror(/* es= */ false); + if (have_sev_es) + test_sev_mirror(/* es= */ true); + test_sev_mirror_parameters(); + } + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c new file mode 100644 index 000000000000..7c70c0da4fb7 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <math.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "linux/psp-sev.h" +#include "sev.h" + + +#define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) + +static void guest_sev_es_code(void) +{ + /* TODO: Check CPUID after GHCB-based hypercall support is added. */ + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED); + + /* + * TODO: Add GHCB and ucall support for SEV-ES guests. For now, simply + * force "termination" to signal "done" via the GHCB MSR protocol. + */ + wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ); + __asm__ __volatile__("rep; vmmcall"); +} + +static void guest_sev_code(void) +{ + GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV)); + GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED); + + GUEST_DONE(); +} + +/* Stash state passed via VMSA before any compiled code runs. 
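 * (Aside, not part of the patch: in the asm stub below, EAX and EDX are set
 * to all ones to form the XSAVE instruction mask, so "xsave (%rdi)" dumps
 * every XCR0-enabled state component into the buffer whose GVA arrives as
 * the first guest argument in RDI, before jumping on to guest_sev_es_code().)
 *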
*/ +extern void guest_code_xsave(void); +asm("guest_code_xsave:\n" + "mov $-1, %eax\n" + "mov $-1, %edx\n" + "xsave (%rdi)\n" + "jmp guest_sev_es_code"); + +static void compare_xsave(u8 *from_host, u8 *from_guest) +{ + int i; + bool bad = false; + for (i = 0; i < 4095; i++) { + if (from_host[i] != from_guest[i]) { + printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]); + bad = true; + } + } + + if (bad) + abort(); +} + +static void test_sync_vmsa(uint32_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t gva; + void *hva; + + double x87val = M_PI; + struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; + struct kvm_sregs sregs; + struct kvm_xcrs xcrs = { + .nr_xcrs = 1, + .xcrs[0].xcr = 0, + .xcrs[0].value = XFEATURE_MASK_X87_AVX, + }; + + vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, + MEM_REGION_TEST_DATA); + hva = addr_gva2hva(vm, gva); + + vcpu_args_set(vcpu, 1, gva); + + vcpu_sregs_get(vcpu, &sregs); + sregs.cr4 |= X86_CR4_OSFXSR | X86_CR4_OSXSAVE; + vcpu_sregs_set(vcpu, &sregs); + + vcpu_xcrs_set(vcpu, &xcrs); + asm("fninit\n" + "vpcmpeqb %%ymm4, %%ymm4, %%ymm4\n" + "fldl %3\n" + "xsave (%2)\n" + "fstp %%st\n" + : "=m"(xsave) + : "A"(XFEATURE_MASK_X87_AVX), "r"(&xsave), "m" (x87val) + : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); + vcpu_xsave_set(vcpu, &xsave); + + vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + + /* This page is shared, so make it decrypted. */ + memset(hva, 0, 4096); + + vcpu_run(vcpu); + + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + + compare_xsave((u8 *)&xsave, (u8 *)hva); + + kvm_vm_free(vm); +} + +static void test_sev(void *guest_code, uint64_t policy) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + + uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; + + vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); + + /* TODO: Validate the measurement is as expected. 
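 * (Aside, not part of the patch: vm_sev_launch() below is the selftest
 * wrapper that is expected to drive KVM_SEV_LAUNCH_START, encrypt guest
 * memory via KVM_SEV_LAUNCH_UPDATE_DATA, grab the launch measurement with
 * KVM_SEV_LAUNCH_MEASURE (the NULL argument discards it, hence the TODO),
 * and finish with KVM_SEV_LAUNCH_FINISH.)
 *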
*/ + vm_sev_launch(vm, policy, NULL); + + for (;;) { + vcpu_run(vcpu); + + if (policy & SEV_POLICY_ES) { + TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, + "Wanted SYSTEM_EVENT, got %s", + exit_reason_str(vcpu->run->exit_reason)); + TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM); + TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1); + TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ); + break; + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + continue; + case UCALL_DONE: + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + test_sev(guest_sev_code, SEV_POLICY_NO_DBG); + test_sev(guest_sev_code, 0); + + if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { + test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); + test_sev(guest_sev_es_code, SEV_POLICY_ES); + + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) { + test_sync_vmsa(0); + test_sync_vmsa(SEV_POLICY_NO_DBG); + } + } + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c new file mode 100644 index 000000000000..fabeeaddfb3a --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Test that KVM emulates instructions in response to EPT violations when + * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR. + */ +#include "flds_emulation.h" + +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define MAXPHYADDR 36 + +#define MEM_REGION_GVA 0x0000123456789000 +#define MEM_REGION_GPA 0x0000000700000000 +#define MEM_REGION_SLOT 10 +#define MEM_REGION_SIZE PAGE_SIZE + +static void guest_code(bool tdp_enabled) +{ + uint64_t error_code; + uint64_t vector; + + vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA)); + + /* + * When TDP is enabled, flds will trigger an emulation failure, exit to + * userspace, and then the selftest host "VMM" skips the instruction. + * + * When TDP is disabled, no instruction emulation is required so flds + * should generate #PF(RSVD). 
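 * (Aside, not part of the patch: the faulting access is set up by main(),
 * which caps guest MAXPHYADDR at 36 and then sets BIT_ULL(36) in the guest
 * PTE for MEM_REGION_GVA, a bit that is legal for the host but reserved from
 * the guest's point of view.)
 *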
+ */ + if (tdp_enabled) { + GUEST_ASSERT(!vector); + } else { + GUEST_ASSERT_EQ(vector, PF_VECTOR); + GUEST_ASSERT(error_code & PFERR_RSVD_MASK); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct ucall uc; + uint64_t *pte; + uint64_t *hva; + uint64_t gpa; + int rc; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled()); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR); + + rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE); + TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable"); + vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + MEM_REGION_GPA, MEM_REGION_SLOT, + MEM_REGION_SIZE / PAGE_SIZE, 0); + gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE, + MEM_REGION_GPA, MEM_REGION_SLOT); + TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc"); + virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1); + hva = addr_gpa2hva(vm, MEM_REGION_GPA); + memset(hva, 0, PAGE_SIZE); + + pte = vm_get_page_table_entry(vm, MEM_REGION_GVA); + *pte |= BIT_ULL(MAXPHYADDR); + + vcpu_run(vcpu); + + /* + * When TDP is enabled, KVM must emulate in response the guest physical + * address that is illegal from the guest's perspective, but is legal + * from hardware's perspeective. This should result in an emulation + * failure exit to userspace since KVM doesn't support emulating flds. + */ + if (kvm_is_tdp_enabled()) { + handle_flds_emulation_failure_exit(vcpu); + vcpu_run(vcpu); + } + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unrecognized ucall: %lu", uc.cmd); + } + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c index ae39a220609f..55c88d664a94 100644 --- a/tools/testing/selftests/kvm/x86_64/smm_test.c +++ b/tools/testing/selftests/kvm/x86_64/smm_test.c @@ -4,7 +4,6 @@ * * Tests for SMM. 
*/ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -19,10 +18,6 @@ #include "vmx.h" #include "svm_util.h" -#define VCPU_ID 1 - -#define PAGE_SIZE 4096 - #define SMRAM_SIZE 65536 #define SMRAM_MEMSLOT ((1 << 16) | 1) #define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE) @@ -53,15 +48,28 @@ static inline void sync_with_host(uint64_t phase) : "+a" (phase)); } -void self_smi(void) +static void self_smi(void) +{ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); +} + +static void l2_guest_code(void) { - wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4), - APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI); + sync_with_host(8); + + sync_with_host(10); + + vmcall(); } -void guest_code(void *arg) +static void guest_code(void *arg) { + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint64_t apicbase = rdmsr(MSR_IA32_APICBASE); + struct svm_test_data *svm = arg; + struct vmx_pages *vmx_pages = arg; sync_with_host(1); @@ -74,37 +82,63 @@ void guest_code(void *arg) sync_with_host(4); if (arg) { - if (cpu_has_svm()) - generic_svm_setup(arg, NULL, NULL); - else - GUEST_ASSERT(prepare_for_vmx_operation(arg)); + if (this_cpu_has(X86_FEATURE_SVM)) { + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } else { + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + } sync_with_host(5); self_smi(); sync_with_host(7); + + if (this_cpu_has(X86_FEATURE_SVM)) { + run_guest(svm->vmcb, svm->vmcb_gpa); + run_guest(svm->vmcb, svm->vmcb_gpa); + } else { + vmlaunch(); + vmresume(); + } + + /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */ + sync_with_host(12); } sync_with_host(DONE); } +void inject_smi(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + vcpu_events_get(vcpu, &events); + + events.smi.pending = 1; + events.flags |= KVM_VCPUEVENT_VALID_SMM; + + vcpu_events_set(vcpu, &events); +} + int main(int argc, char *argv[]) { vm_vaddr_t nested_gva = 0; + struct kvm_vcpu *vcpu; struct kvm_regs regs; struct kvm_vm *vm; - struct kvm_run *run; struct kvm_x86_state *state; int stage, stage_reported; - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM)); - run = vcpu_state(vm, VCPU_ID); + /* Create VM */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA, SMRAM_MEMSLOT, SMRAM_PAGES, 0); @@ -115,29 +149,26 @@ int main(int argc, char *argv[]) memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler, sizeof(smi_handler)); - vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA); + vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA); - if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - if (nested_svm_supported()) + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); - else if (nested_vmx_supported()) + else if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); } if (!nested_gva) pr_info("will skip SMM test with VMX enabled\n"); - vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Stage %d: unexpected exit reason: %u (%s),\n", - stage, run->exit_reason, - 
exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); memset(®s, 0, sizeof(regs)); - vcpu_regs_get(vm, VCPU_ID, ®s); + vcpu_regs_get(vcpu, ®s); stage_reported = regs.rax & 0xff; @@ -149,14 +180,28 @@ int main(int argc, char *argv[]) "Unexpected stage: #%x, got %x", stage, stage_reported); - state = vcpu_save_state(vm, VCPU_ID); + /* + * Enter SMM during L2 execution and check that we correctly + * return from it. Do not perform save/restore while in SMM yet. + */ + if (stage == 8) { + inject_smi(vcpu); + continue; + } + + /* + * Perform save/restore while the guest is in SMM triggered + * during L2 execution. + */ + if (stage == 10) + inject_smi(vcpu); + + state = vcpu_save_state(vcpu); kvm_vm_release(vm); - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); + + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); } done: diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index f6c8b9042f8a..1c756db329e5 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -6,7 +6,6 @@ * * Tests for vCPU state save/restore, including nested guest state. */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -20,7 +19,6 @@ #include "vmx.h" #include "svm_util.h" -#define VCPU_ID 5 #define L2_GUEST_STACK_SIZE 256 void svm_l2_guest_code(void) @@ -140,10 +138,87 @@ static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) static void __attribute__((__flatten__)) guest_code(void *arg) { GUEST_SYNC(1); + + if (this_cpu_has(X86_FEATURE_XSAVE)) { + uint64_t supported_xcr0 = this_cpu_supported_xcr0(); + uint8_t buffer[4096]; + + memset(buffer, 0xcc, sizeof(buffer)); + + set_cr4(get_cr4() | X86_CR4_OSXSAVE); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); + + xsetbv(0, xgetbv(0) | supported_xcr0); + + /* + * Modify state for all supported xfeatures to take them out of + * their "init" state, i.e. to make them show up in XSTATE_BV. + * + * Note off-by-default features, e.g. AMX, are out of scope for + * this particular testcase as they have a different ABI. + */ + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_FP); + asm volatile ("fincstp"); + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_SSE); + asm volatile ("vmovdqu %0, %%xmm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_YMM) + asm volatile ("vmovdqu %0, %%ymm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_AVX512) { + asm volatile ("kmovq %0, %%k1" :: "r" (-1ull)); + asm volatile ("vmovupd %0, %%zmm0" :: "m" (buffer)); + asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer)); + } + + if (this_cpu_has(X86_FEATURE_MPX)) { + uint64_t bounds[2] = { 10, 0xffffffffull }; + uint64_t output[2] = { }; + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDREGS); + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDCSR); + + /* + * Don't bother trying to get BNDCSR into the INUSE + * state. MSR_IA32_BNDCFGS doesn't count as it isn't + * managed via XSAVE/XRSTOR, and BNDCFGU can only be + * modified by XRSTOR. Stuffing XSTATE_BV in the host + * is simpler than doing XRSTOR here in the guest. + * + * However, temporarily enable MPX in BNDCFGS so that + * BNDMOV actually loads BND1. 
If MPX isn't *fully* + * enabled, all MPX instructions are treated as NOPs. + * + * Hand encode "bndmov (%rax),%bnd1" as support for MPX + * mnemonics/registers has been removed from gcc and + * clang (and was never fully supported by clang). + */ + wrmsr(MSR_IA32_BNDCFGS, BIT_ULL(0)); + asm volatile (".byte 0x66,0x0f,0x1a,0x08" :: "a" (bounds)); + /* + * Hand encode "bndmov %bnd1, (%rax)" to sanity check + * that BND1 actually got loaded. + */ + asm volatile (".byte 0x66,0x0f,0x1b,0x08" :: "a" (output)); + wrmsr(MSR_IA32_BNDCFGS, 0); + + GUEST_ASSERT_EQ(bounds[0], output[0]); + GUEST_ASSERT_EQ(bounds[1], output[1]); + } + if (this_cpu_has(X86_FEATURE_PKU)) { + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_PKRU); + set_cr4(get_cr4() | X86_CR4_PKE); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSPKE)); + + wrpkru(-1u); + } + } + GUEST_SYNC(2); if (arg) { - if (cpu_has_svm()) + if (this_cpu_has(X86_FEATURE_SVM)) svm_l1_guest_code(arg); else vmx_l1_guest_code(arg); @@ -154,45 +229,40 @@ static void __attribute__((__flatten__)) guest_code(void *arg) int main(int argc, char *argv[]) { + uint64_t *xstate_bv, saved_xstate_bv; vm_vaddr_t nested_gva = 0; - + struct kvm_cpuid2 empty_cpuid = {}; struct kvm_regs regs1, regs2; + struct kvm_vcpu *vcpu, *vcpuN; struct kvm_vm *vm; - struct kvm_run *run; struct kvm_x86_state *state; struct ucall uc; int stage; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - run = vcpu_state(vm, VCPU_ID); + vm = vm_create_with_one_vcpu(&vcpu, guest_code); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vcpu, ®s1); - if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - if (nested_svm_supported()) + if (kvm_has_cap(KVM_CAP_NESTED_STATE)) { + if (kvm_cpu_has(X86_FEATURE_SVM)) vcpu_alloc_svm(vm, &nested_gva); - else if (nested_vmx_supported()) + else if (kvm_cpu_has(X86_FEATURE_VMX)) vcpu_alloc_vmx(vm, &nested_gva); } if (!nested_gva) pr_info("will skip nested state checks\n"); - vcpu_args_set(vm, VCPU_ID, 1, nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Stage %d: unexpected exit reason: %u (%s),\n", - stage, run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; @@ -207,22 +277,47 @@ int main(int argc, char *argv[]) uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx", stage, (ulong)uc.args[1]); - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vcpu); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vcpu, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + + /* + * Restore XSAVE state in a dummy vCPU, first without doing + * KVM_SET_CPUID2, and then with an empty guest CPUID. Except + * for off-by-default xfeatures, e.g. AMX, KVM is supposed to + * allow KVM_SET_XSAVE regardless of guest CPUID. 
Manually + * load only XSAVE state, MSRs in particular have a much more + * convoluted ABI. + * + * Load two versions of XSAVE state: one with the actual guest + * XSAVE state, and one with all supported features forced "on" + * in xstate_bv, e.g. to ensure that KVM allows loading all + * supported features, even if something goes awry in saving + * the original snapshot. + */ + xstate_bv = (void *)&((uint8_t *)state->xsave->region)[512]; + saved_xstate_bv = *xstate_bv; + + vcpuN = __vm_vcpu_add(vm, vcpu->id + 1); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = kvm_cpu_supported_xcr0(); + vcpu_xsave_set(vcpuN, state->xsave); + + vcpu_init_cpuid(vcpuN, &empty_cpuid); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = saved_xstate_bv; + vcpu_xsave_set(vcpuN, state->xsave); + + kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); + vcpu_regs_get(vcpu, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c new file mode 100644 index 000000000000..916e04248fbb --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "apic.h" + +bool vintr_irq_called; +bool intr_irq_called; + +#define VINTR_IRQ_NUMBER 0x20 +#define INTR_IRQ_NUMBER 0x30 + +static void vintr_irq_handler(struct ex_regs *regs) +{ + vintr_irq_called = true; +} + +static void intr_irq_handler(struct ex_regs *regs) +{ + x2apic_write_reg(APIC_EOI, 0x00); + intr_irq_called = true; +} + +static void l2_guest_code(struct svm_test_data *svm) +{ + /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC, + * and since L1 didn't enable virtual interrupt masking, + * L2 should receive it and not L1. + * + * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ + * so it should also receive it after the following 'sti'. + */ + x2apic_write_reg(APIC_ICR, + APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + GUEST_ASSERT(vintr_irq_called); + GUEST_ASSERT(intr_irq_called); + + __asm__ __volatile__( + "vmcall\n" + ); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + x2apic_enable(); + + /* Prepare for L2 execution. 
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* No virtual interrupt masking */ + vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; + + /* No intercepts for real and virtual interrupts */ + vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR)); + + /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */ + vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT); + vmcb->control.int_vector = VINTR_IRQ_NUMBER; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler); + vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c new file mode 100644 index 000000000000..00135cbba35e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_nested_shutdown_test + * + * Copyright (C) 2022, Red Hat, Inc. + * + * Nested SVM testing: test that unintercepted shutdown in L2 doesn't crash the host + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +static void l2_guest_code(struct svm_test_data *svm) +{ + __asm__ __volatile__("ud2"); +} + +static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); + + idt[6].p = 0; // #UD is intercepted but its injection will cause #NP + idt[11].p = 0; // #NP is not intercepted and will cause another + // #NP that will be converted to #DF + idt[8].p = 0; // #DF will cause #NP which will cause SHUTDOWN + + run_guest(vmcb, svm->vmcb_gpa); + + /* should not reach here */ + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vcpu_alloc_svm(vm, &svm_gva); + + vcpu_args_set(vcpu, 2, svm_gva, vm->arch.idt); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c new file mode 100644 index 000000000000..7b6481d6c0d3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Oracle and/or its affiliates. 
+ * + * Based on: + * svm_int_ctl_test + * + * Copyright (C) 2021, Red Hat, Inc. + * + */ +#include <stdatomic.h> +#include <stdio.h> +#include <unistd.h> +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "test_util.h" + +#define INT_NR 0x20 + +static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless"); + +static unsigned int bp_fired; +static void guest_bp_handler(struct ex_regs *regs) +{ + bp_fired++; +} + +static unsigned int int_fired; +static void l2_guest_code_int(void); + +static void guest_int_handler(struct ex_regs *regs) +{ + int_fired++; + GUEST_ASSERT_EQ(regs->rip, (unsigned long)l2_guest_code_int); +} + +static void l2_guest_code_int(void) +{ + GUEST_ASSERT_EQ(int_fired, 1); + + /* + * Same as the vmmcall() function, but with a ud2 sneaked after the + * vmmcall. The caller injects an exception with the return address + * increased by 2, so the "pop rbp" must be after the ud2 and we cannot + * use vmmcall() directly. + */ + __asm__ __volatile__("push %%rbp; vmmcall; ud2; pop %%rbp" + : : "a"(0xdeadbeef), "c"(0xbeefdead) + : "rbx", "rdx", "rsi", "rdi", "r8", "r9", + "r10", "r11", "r12", "r13", "r14", "r15"); + + GUEST_ASSERT_EQ(bp_fired, 1); + hlt(); +} + +static atomic_int nmi_stage; +#define nmi_stage_get() atomic_load_explicit(&nmi_stage, memory_order_acquire) +#define nmi_stage_inc() atomic_fetch_add_explicit(&nmi_stage, 1, memory_order_acq_rel) +static void guest_nmi_handler(struct ex_regs *regs) +{ + nmi_stage_inc(); + + if (nmi_stage_get() == 1) { + vmmcall(); + GUEST_FAIL("Unexpected resume after VMMCALL"); + } else { + GUEST_ASSERT_EQ(nmi_stage_get(), 3); + GUEST_DONE(); + } +} + +static void l2_guest_code_nmi(void) +{ + ud2(); +} + +static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t idt_alt) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + if (is_nmi) + x2apic_enable(); + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, + is_nmi ? 
l2_guest_code_nmi : l2_guest_code_int, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR); + vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT); + + if (is_nmi) { + vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + } else { + vmcb->control.event_inj = INT_NR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT; + /* The return address pushed on stack */ + vmcb->control.next_rip = vmcb->save.rip; + } + + run_guest(vmcb, svm->vmcb_gpa); + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, + "Expected VMMCAL #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + if (is_nmi) { + clgi(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_NMI); + + GUEST_ASSERT_EQ(nmi_stage_get(), 1); + nmi_stage_inc(); + + stgi(); + /* self-NMI happens here */ + while (true) + cpu_relax(); + } + + /* Skip over VMMCALL */ + vmcb->save.rip += 3; + + /* Switch to alternate IDT to cause intervening NPF again */ + vmcb->save.idtr.base = idt_alt; + vmcb->control.clean = 0; /* &= ~BIT(VMCB_DT) would be enough */ + + vmcb->control.event_inj = BP_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; + /* The return address pushed on stack, skip over UD2 */ + vmcb->control.next_rip = vmcb->save.rip + 2; + + run_guest(vmcb, svm->vmcb_gpa); + __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, + "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + vmcb->control.exit_code, + vmcb->control.exit_info_1, vmcb->control.exit_info_2); + + GUEST_DONE(); +} + +static void run_test(bool is_nmi) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t svm_gva; + vm_vaddr_t idt_alt_vm; + struct kvm_guest_debug debug; + + pr_info("Running %s test\n", is_nmi ? 
"NMI" : "soft int"); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler); + vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler); + vm_install_exception_handler(vm, INT_NR, guest_int_handler); + + vcpu_alloc_svm(vm, &svm_gva); + + if (!is_nmi) { + void *idt, *idt_alt; + + idt_alt_vm = vm_vaddr_alloc_page(vm); + idt_alt = addr_gva2hva(vm, idt_alt_vm); + idt = addr_gva2hva(vm, vm->arch.idt); + memcpy(idt_alt, idt, getpagesize()); + } else { + idt_alt_vm = 0; + } + vcpu_args_set(vcpu, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm); + + memset(&debug, 0, sizeof(debug)); + vcpu_guest_debug_set(vcpu, &debug); + + struct ucall uc; + + alarm(2); + vcpu_run(vcpu); + alarm(0); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } +done: + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS), + "KVM with nSVM is supposed to unconditionally advertise nRIP Save"); + + atomic_init(&nmi_stage, 0); + + run_test(false); + run_test(true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c index 0e1adb4e3199..8a62cca28cfb 100644 --- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -12,10 +12,6 @@ #include "processor.h" #include "svm_util.h" -#define VCPU_ID 5 - -static struct kvm_vm *vm; - static void l2_guest_code(struct svm_test_data *svm) { __asm__ __volatile__("vmcall"); @@ -39,29 +35,26 @@ static void l1_guest_code(struct svm_test_data *svm) int main(int argc, char *argv[]) { + struct kvm_vcpu *vcpu; vm_vaddr_t svm_gva; + struct kvm_vm *vm; - nested_svm_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vcpu_alloc_svm(vm, &svm_gva); - vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c index d672f0a473f8..8fa3948b0170 100644 --- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c +++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c @@ -8,22 +8,24 @@ * including requesting an invalid register set, updates to/from values * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled. 
*/ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> +#include <pthread.h> +#include "kvm_test_harness.h" #include "test_util.h" #include "kvm_util.h" #include "processor.h" -#define VCPU_ID 5 - #define UCALL_PIO_PORT ((uint16_t)0x1000) +struct ucall uc_none = { + .cmd = UCALL_NONE, +}; + /* * ucall is embedded here to protect against compiler reshuffling registers * before calling a function. In this test we only need to get KVM_EXIT_IO @@ -34,15 +36,18 @@ void guest_code(void) asm volatile("1: in %[port], %%al\n" "add $0x1, %%rbx\n" "jmp 1b" - : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx"); + : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none) + : "rax", "rbx"); } +KVM_ONE_VCPU_TEST_SUITE(sync_regs_test); + static void compare_regs(struct kvm_regs *left, struct kvm_regs *right) { #define REG_COMPARE(reg) \ TEST_ASSERT(left->reg == right->reg, \ "Register " #reg \ - " values did not match: 0x%llx, 0x%llx\n", \ + " values did not match: 0x%llx, 0x%llx", \ left->reg, right->reg) REG_COMPARE(rax); REG_COMPARE(rbx); @@ -77,80 +82,205 @@ static void compare_vcpu_events(struct kvm_vcpu_events *left, #define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS) #define INVALID_SYNC_FIELD 0x80000000 -int main(int argc, char *argv[]) +/* + * Set an exception as pending *and* injected while KVM is processing events. + * KVM is supposed to ignore/drop pending exceptions if userspace is also + * requesting that an exception be injected. + */ +static void *race_events_inj_pen(void *arg) { - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_vcpu_events events; - int rv, cap; + struct kvm_run *run = (struct kvm_run *)arg; + struct kvm_vcpu_events *events = &run->s.regs.events; - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); + WRITE_ONCE(events->exception.nr, UD_VECTOR); - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) { - print_skip("KVM_CAP_SYNC_REGS not supported"); - exit(KSFT_SKIP); + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); + WRITE_ONCE(events->flags, 0); + WRITE_ONCE(events->exception.injected, 1); + WRITE_ONCE(events->exception.pending, 1); + + pthread_testcancel(); } - if ((cap & INVALID_SYNC_FIELD) != 0) { - print_skip("The \"invalid\" field is not invalid"); - exit(KSFT_SKIP); + + return NULL; +} + +/* + * Set an invalid exception vector while KVM is processing events. KVM is + * supposed to reject any vector >= 32, as well as NMIs (vector 2). + */ +static void *race_events_exc(void *arg) +{ + struct kvm_run *run = (struct kvm_run *)arg; + struct kvm_vcpu_events *events = &run->s.regs.events; + + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS); + WRITE_ONCE(events->flags, 0); + WRITE_ONCE(events->exception.nr, UD_VECTOR); + WRITE_ONCE(events->exception.pending, 1); + WRITE_ONCE(events->exception.nr, 255); + + pthread_testcancel(); } - /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); + return NULL; +} - run = vcpu_state(vm, VCPU_ID); +/* + * Toggle CR4.PAE while KVM is processing SREGS, EFER.LME=1 with CR4.PAE=0 is + * illegal, and KVM's MMU heavily relies on vCPU state being valid. 
+ */ +static noinline void *race_sregs_cr4(void *arg) +{ + struct kvm_run *run = (struct kvm_run *)arg; + __u64 *cr4 = &run->s.regs.sregs.cr4; + __u64 pae_enabled = *cr4; + __u64 pae_disabled = *cr4 & ~X86_CR4_PAE; + + for (;;) { + WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_SREGS); + WRITE_ONCE(*cr4, pae_enabled); + asm volatile(".rept 512\n\t" + "nop\n\t" + ".endr"); + WRITE_ONCE(*cr4, pae_disabled); + + pthread_testcancel(); + } + + return NULL; +} + +static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer) +{ + const time_t TIMEOUT = 2; /* seconds, roughly */ + struct kvm_x86_state *state; + struct kvm_translation tr; + struct kvm_run *run; + pthread_t thread; + time_t t; + + run = vcpu->run; + + run->kvm_valid_regs = KVM_SYNC_X86_SREGS; + vcpu_run(vcpu); + run->kvm_valid_regs = 0; + + /* Save state *before* spawning the thread that mucks with vCPU state. */ + state = vcpu_save_state(vcpu); + + /* + * Selftests run 64-bit guests by default, both EFER.LME and CR4.PAE + * should already be set in guest state. + */ + TEST_ASSERT((run->s.regs.sregs.cr4 & X86_CR4_PAE) && + (run->s.regs.sregs.efer & EFER_LME), + "vCPU should be in long mode, CR4.PAE=%d, EFER.LME=%d", + !!(run->s.regs.sregs.cr4 & X86_CR4_PAE), + !!(run->s.regs.sregs.efer & EFER_LME)); + + TEST_ASSERT_EQ(pthread_create(&thread, NULL, racer, (void *)run), 0); + + for (t = time(NULL) + TIMEOUT; time(NULL) < t;) { + /* + * Reload known good state if the vCPU triple faults, e.g. due + * to the unhandled #GPs being injected. VMX preserves state + * on shutdown, but SVM synthesizes an INIT as the VMCB state + * is architecturally undefined on triple fault. + */ + if (!__vcpu_run(vcpu) && run->exit_reason == KVM_EXIT_SHUTDOWN) + vcpu_load_state(vcpu, state); + + if (racer == race_sregs_cr4) { + tr = (struct kvm_translation) { .linear_address = 0 }; + __vcpu_ioctl(vcpu, KVM_TRANSLATE, &tr); + } + } + + TEST_ASSERT_EQ(pthread_cancel(thread), 0); + TEST_ASSERT_EQ(pthread_join(thread, NULL), 0); + + kvm_x86_state_cleanup(state); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code) +{ + struct kvm_run *run = vcpu->run; + int rv; /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; + run->kvm_valid_regs = 0; +} + +KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code) +{ + struct kvm_run *run = vcpu->run; + int rv; /* Request setting invalid register set into VCPU. 
*/ run->kvm_dirty_regs = INVALID_SYNC_FIELD; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); + rv = _vcpu_run(vcpu); TEST_ASSERT(rv < 0 && errno == EINVAL, - "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", + "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d", rv); - vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; + run->kvm_dirty_regs = 0; +} + +KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_events events; + struct kvm_sregs sregs; + struct kvm_regs regs; /* Request and verify all valid register sets. */ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */ run->kvm_valid_regs = TEST_SYNC_FIELDS; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu, &regs); compare_regs(&regs, &run->s.regs.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vcpu, &events); compare_vcpu_events(&events, &run->s.regs.events); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_events events; + struct kvm_sregs sregs; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); /* Set and verify various register values. */ run->s.regs.regs.rbx = 0xBAD1DEA; @@ -159,11 +289,8 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); @@ -171,14 +298,19 @@ int main(int argc, char *argv[]) "apic_base sync regs value incorrect 0x%llx.", run->s.regs.sregs.apic_base); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu, &regs); compare_regs(&regs, &run->s.regs.regs); - vcpu_sregs_get(vm, VCPU_ID, &sregs); + vcpu_sregs_get(vcpu, &sregs); compare_sregs(&sregs, &run->s.regs.sregs); - vcpu_events_get(vm, VCPU_ID, &events); + vcpu_events_get(vcpu, &events); compare_vcpu_events(&events, &run->s.regs.events); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code) +{ + struct kvm_run *run = vcpu->run; /* Clear kvm_dirty_regs bits, verify new s.regs values are * overwritten with existing guest values. 
@@ -186,14 +318,22 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = TEST_SYNC_FIELDS; run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xDEADBEEF; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); /* Clear kvm_valid_regs bits and kvm_dirty_bits. * Verify s.regs values are not overwritten with existing guest values @@ -202,20 +342,29 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = 0; run->kvm_dirty_regs = 0; run->s.regs.regs.rbx = 0xAAAA; + vcpu_regs_get(vcpu, &regs); regs.rbx = 0xBAC0; - vcpu_regs_set(vm, VCPU_ID, &regs); - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_regs_set(vcpu, &regs); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu, &regs); TEST_ASSERT(regs.rbx == 0xBAC0 + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code) +{ + struct kvm_run *run = vcpu->run; + struct kvm_regs regs; + + /* Run once to get register set */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); /* Clear kvm_valid_regs bits. 
Verify s.regs values are not overwritten * with existing guest values but that guest values are overwritten @@ -224,20 +373,39 @@ int main(int argc, char *argv[]) run->kvm_valid_regs = 0; run->kvm_dirty_regs = TEST_SYNC_FIELDS; run->s.regs.regs.rbx = 0xBBBB; - rv = _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB, "rbx sync regs value incorrect 0x%llx.", run->s.regs.regs.rbx); - vcpu_regs_get(vm, VCPU_ID, &regs); + vcpu_regs_get(vcpu, &regs); TEST_ASSERT(regs.rbx == 0xBBBB + 1, "rbx guest value incorrect 0x%llx.", regs.rbx); +} - kvm_vm_free(vm); +KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code) +{ + race_sync_regs(vcpu, race_sregs_cr4); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code) +{ + race_sync_regs(vcpu, race_events_exc); +} + +KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code) +{ + race_sync_regs(vcpu, race_events_inj_pen); +} + +int main(int argc, char *argv[]) +{ + int cap; + + cap = kvm_check_cap(KVM_CAP_SYNC_REGS); + TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS); + TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD)); - return 0; + return test_harness_run(argc, argv); } diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c new file mode 100644 index 000000000000..56306a19144a --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#include <string.h> +#include <sys/ioctl.h> + +#include "kselftest.h" + +#define ARBITRARY_IO_PORT 0x2000 + +/* The virtual machine object. */ +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + asm volatile("inb %%dx, %%al" + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); +} + +#define L2_GUEST_STACK_SIZE 64 +unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + +void l1_guest_code_vmx(struct vmx_pages *vmx) +{ + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + + prepare_vmcs(vmx, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmlaunch()); + /* L2 should triple fault after a triple fault event injected. 
*/ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); + GUEST_DONE(); +} + +void l1_guest_code_svm(struct svm_test_data *svm) +{ + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* don't intercept shutdown to test the case of SVM allowing to do so */ + vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN)); + + run_guest(vmcb, svm->vmcb_gpa); + + /* should not reach here, L1 should crash */ + GUEST_ASSERT(0); +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vcpu_events events; + struct ucall uc; + + bool has_vmx = kvm_cpu_has(X86_FEATURE_VMX); + bool has_svm = kvm_cpu_has(X86_FEATURE_SVM); + + TEST_REQUIRE(has_vmx || has_svm); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT)); + + + if (has_vmx) { + vm_vaddr_t vmx_pages_gva; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_vmx); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + } else { + vm_vaddr_t svm_gva; + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_svm); + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 1, svm_gva); + } + + vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1); + run = vcpu->run; + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, + "Expected IN from port %d from L2, got port %d", + ARBITRARY_IO_PORT, run->io.port); + vcpu_events_get(vcpu, &events); + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = true; + vcpu_events_set(vcpu, &events); + run->immediate_exit = true; + vcpu_run_complete_io(vcpu); + + vcpu_events_get(vcpu, &events); + TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT, + "Triple fault event invalid"); + TEST_ASSERT(events.triple_fault.pending, + "No triple fault pending"); + vcpu_run(vcpu); + + + if (has_svm) { + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + } else { + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c new file mode 100644 index 000000000000..12b0964f4f13 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST. + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include <stdio.h> +#include <string.h> +#include "kvm_util.h" +#include "processor.h" + +#define UNITY (1ull << 30) +#define HOST_ADJUST (UNITY * 64) +#define GUEST_STEP (UNITY * 4) +#define ROUND(x) ((x + UNITY / 2) & -UNITY) +#define rounded_rdmsr(x) ROUND(rdmsr(x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vcpu, x)) + +static void guest_code(void) +{ + u64 val = 0; + + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + val = 1ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. 
*/ + GUEST_SYNC(2); + val = 2ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Host: setting the TSC offset. */ + GUEST_SYNC(3); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + GUEST_SYNC(4); + val = 3ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + GUEST_SYNC(5); + val = 4ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vcpu *vcpu, int stage) +{ + struct ucall uc; + + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + if (!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1) + ksft_test_result_pass("stage %d passed\n", stage + 1); + else + ksft_test_result_fail( + "stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + ksft_test_result_pass("stage %d passed\n", stage + 1); + return; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu->run->exit_reason)); + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + uint64_t val; + + ksft_print_header(); + ksft_set_plan(5); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + val = 0; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + run_vcpu(vcpu, 1); + val = 1ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + run_vcpu(vcpu, 2); + val = 2ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Host: writes to MSR_IA32_TSC set the host-side offset + * and therefore do not change MSR_IA32_TSC_ADJUST. + */ + vcpu_set_msr(vcpu, MSR_IA32_TSC, HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + run_vcpu(vcpu, 3); + + /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, UNITY * 123456); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_TSC_ADJUST), UNITY * 123456); + + /* Restore previous value. */ + vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. 
+ */ + run_vcpu(vcpu, 4); + val = 3ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + run_vcpu(vcpu, 5); + val = 4ull * GUEST_STEP; + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ +} diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c new file mode 100644 index 000000000000..59c7304f805e --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include <stdint.h> +#include <time.h> +#include <sched.h> +#include <signal.h> +#include <pthread.h> + +#define NR_TEST_VCPUS 20 + +static struct kvm_vm *vm; +pthread_spinlock_t create_lock; + +#define TEST_TSC_KHZ 2345678UL +#define TEST_TSC_OFFSET 200000000 + +uint64_t tsc_sync; +static void guest_code(void) +{ + uint64_t start_tsc, local_tsc, tmp; + + start_tsc = rdtsc(); + do { + tmp = READ_ONCE(tsc_sync); + local_tsc = rdtsc(); + WRITE_ONCE(tsc_sync, local_tsc); + if (unlikely(local_tsc < tmp)) + GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0); + + } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ); + + GUEST_DONE(); +} + + +static void *run_vcpu(void *_cpu_nr) +{ + unsigned long vcpu_id = (unsigned long)_cpu_nr; + unsigned long failures = 0; + static bool first_cpu_done; + struct kvm_vcpu *vcpu; + + /* The kernel is fine, but vm_vcpu_add() needs locking */ + pthread_spin_lock(&create_lock); + + vcpu = vm_vcpu_add(vm, vcpu_id, guest_code); + + if (!first_cpu_done) { + first_cpu_done = true; + vcpu_set_msr(vcpu, MSR_IA32_TSC, TEST_TSC_OFFSET); + } + + pthread_spin_unlock(&create_lock); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + goto out; + + case UCALL_SYNC: + printf("Guest %d sync %lx %lx %ld\n", vcpu->id, + uc.args[2], uc.args[3], uc.args[2] - uc.args[3]); + failures++; + break; + + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + out: + return (void *)failures; +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_TSC_CONTROL)); + + vm = vm_create(NR_TEST_VCPUS); + vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ); + + pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE); + pthread_t cpu_threads[NR_TEST_VCPUS]; + unsigned long cpu; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) + pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu); + + unsigned long failures = 0; + for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) { + void *this_cpu_failures; + pthread_join(cpu_threads[cpu], &this_cpu_failures); + failures += (unsigned long)this_cpu_failures; + } + + TEST_ASSERT(!failures, "TSC sync failed"); + pthread_spin_destroy(&create_lock); + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c new file mode 100644 index 000000000000..57f157c06b39 --- /dev/null +++ 
b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucna_injection_test + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that user space can inject UnCorrectable No Action required (UCNA) + * memory errors to the guest. + * + * The test starts one vCPU with the MCG_CMCI_P enabled. It verifies that + * proper UCNA errors can be injected to a vCPU with MCG_CMCI_P and + * corresponding per-bank control register (MCI_CTL2) bit enabled. + * The test also checks that the UCNA errors get recorded in the + * Machine Check bank registers no matter the error signal interrupts get + * delivered into the guest or not. + * + */ +#include <pthread.h> +#include <inttypes.h> +#include <string.h> +#include <time.h> + +#include "kvm_util.h" +#include "mce.h" +#include "processor.h" +#include "test_util.h" +#include "apic.h" + +#define SYNC_FIRST_UCNA 9 +#define SYNC_SECOND_UCNA 10 +#define SYNC_GP 11 +#define FIRST_UCNA_ADDR 0xdeadbeef +#define SECOND_UCNA_ADDR 0xcafeb0ba + +/* + * Vector for the CMCI interrupt. + * Value is arbitrary. Any value in 0x20-0xFF should work: + * https://wiki.osdev.org/Interrupt_Vector_Table + */ +#define CMCI_VECTOR 0xa9 + +#define UCNA_BANK 0x7 // IMC0 bank + +#define MCI_CTL2_RESERVED_BIT BIT_ULL(29) + +static uint64_t supported_mcg_caps; + +/* + * Record states about the injected UCNA. + * The variables started with the 'i_' prefixes are recorded in interrupt + * handler. Variables without the 'i_' prefixes are recorded in guest main + * execution thread. + */ +static volatile uint64_t i_ucna_rcvd; +static volatile uint64_t i_ucna_addr; +static volatile uint64_t ucna_addr; +static volatile uint64_t ucna_addr2; + +struct thread_params { + struct kvm_vcpu *vcpu; + uint64_t *p_i_ucna_rcvd; + uint64_t *p_i_ucna_addr; + uint64_t *p_ucna_addr; + uint64_t *p_ucna_addr2; +}; + +static void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void ucna_injection_guest_code(void) +{ + uint64_t ctl2; + verify_apic_base_addr(); + xapic_enable(); + + /* Sets up the interrupt vector and enables per-bank CMCI sigaling. */ + xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED); + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + /* Enables interrupt in guest. */ + asm volatile("sti"); + + /* Let user space inject the first UCNA */ + GUEST_SYNC(SYNC_FIRST_UCNA); + + ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + + /* Disables the per-bank CMCI signaling. 
*/ + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN); + + /* Let the user space inject the second UCNA */ + GUEST_SYNC(SYNC_SECOND_UCNA); + + ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + GUEST_DONE(); +} + +static void cmci_disabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); + + GUEST_DONE(); +} + +static void cmci_enabled_guest_code(void) +{ + uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_RESERVED_BIT); + + GUEST_DONE(); +} + +static void guest_cmci_handler(struct ex_regs *regs) +{ + i_ucna_rcvd++; + i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); + xapic_write_reg(APIC_EOI, 0); +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(SYNC_GP); +} + +static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected."); + printf("vCPU received GP in guest.\n"); +} + +static void inject_ucna(struct kvm_vcpu *vcpu, uint64_t addr) { + /* + * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in + * the IA32_MCi_STATUS register. + * MSCOD=1 (BIT[16] - MscodDataRdErr). + * MCACOD=0x0090 (Memory controller error format, channel 0) + */ + uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090; + struct kvm_x86_mce mce = {}; + mce.status = status; + mce.mcg_status = 0; + /* + * MCM_ADDR_PHYS indicates the reported address is a physical address. + * Lowest 6 bits is the recoverable address LSB, i.e., the injected MCE + * is at 4KB granularity. 
+ */ + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; + mce.addr = addr; + mce.bank = UCNA_BANK; + + vcpu_ioctl(vcpu, KVM_X86_SET_MCE, &mce); +} + +static void *run_ucna_injection(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed with errno=%d", + r); + + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); + + printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); + + inject_ucna(params->vcpu, FIRST_UCNA_ADDR); + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC, + "Expect UCALL_SYNC"); + TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); + + printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); + + inject_ucna(params->vcpu, SECOND_UCNA_ADDR); + vcpu_run(params->vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO); + if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, "vCPU assertion failure: %s.", + (const char *)uc.args[0]); + } + + return NULL; +} + +static void test_ucna_injection(struct kvm_vcpu *vcpu, struct thread_params *params) +{ + struct kvm_vm *vm = vcpu->vm; + params->vcpu = vcpu; + params->p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd); + params->p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr); + params->p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr); + params->p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2); + + run_ucna_injection(params); + + TEST_ASSERT(*params->p_i_ucna_rcvd == 1, "Only first UCNA get signaled."); + TEST_ASSERT(*params->p_i_ucna_addr == FIRST_UCNA_ADDR, + "Only first UCNA reported addr get recorded via interrupt."); + TEST_ASSERT(*params->p_ucna_addr == FIRST_UCNA_ADDR, + "First injected UCNAs should get exposed via registers."); + TEST_ASSERT(*params->p_ucna_addr2 == SECOND_UCNA_ADDR, + "Second injected UCNAs should get exposed via registers."); + + printf("Test successful.\n" + "UCNA CMCI interrupts received: %ld\n" + "Last UCNA address received via CMCI: %lx\n" + "First UCNA address in vCPU thread: %lx\n" + "Second UCNA address in vCPU thread: %lx\n", + *params->p_i_ucna_rcvd, *params->p_i_ucna_addr, + *params->p_ucna_addr, *params->p_ucna_addr2); +} + +static void setup_mce_cap(struct kvm_vcpu *vcpu, bool enable_cmci_p) +{ + uint64_t mcg_caps = MCG_CTL_P | MCG_SER_P | MCG_LMCE_P | KVM_MAX_MCE_BANKS; + if (enable_cmci_p) + mcg_caps |= MCG_CMCI_P; + + mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK; + vcpu_ioctl(vcpu, KVM_X86_SETUP_MCE, &mcg_caps); +} + +static struct kvm_vcpu *create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid, + bool enable_cmci_p, void *guest_code) +{ + struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpuid, guest_code); + setup_mce_cap(vcpu, enable_cmci_p); + return vcpu; +} + +int main(int argc, char *argv[]) +{ + struct thread_params params; + struct kvm_vm *vm; + struct kvm_vcpu *ucna_vcpu; + struct kvm_vcpu *cmcidis_vcpu; + struct kvm_vcpu *cmci_vcpu; + + kvm_check_cap(KVM_CAP_MCE); + + vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0); + + kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, + &supported_mcg_caps); + + if (!(supported_mcg_caps & 
MCG_CMCI_P)) { + print_skip("MCG_CMCI_P is not supported"); + exit(KSFT_SKIP); + } + + ucna_vcpu = create_vcpu_with_mce_cap(vm, 0, true, ucna_injection_guest_code); + cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code); + cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code); + + vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + test_ucna_injection(ucna_vcpu, &params); + run_vcpu_expect_gp(cmcidis_vcpu); + run_vcpu_expect_gp(cmci_vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c new file mode 100644 index 000000000000..9481cbcf284f --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +static void guest_ins_port80(uint8_t *buffer, unsigned int count) +{ + unsigned long end; + + if (count == 2) + end = (unsigned long)buffer + 1; + else + end = (unsigned long)buffer + 8192; + + asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory"); + GUEST_ASSERT_EQ(count, 0); + GUEST_ASSERT_EQ((unsigned long)buffer, end); +} + +static void guest_code(void) +{ + uint8_t buffer[8192]; + int i; + + /* + * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to + * test that KVM doesn't explode when userspace modifies the "count" on + * a userspace I/O exit. KVM isn't required to play nice with the I/O + * itself as KVM doesn't support manipulating the count, it just needs + * to not explode or overflow a buffer. + */ + guest_ins_port80(buffer, 2); + guest_ins_port80(buffer, 3); + + /* Verify KVM fills the buffer correctly when not stuffing RCX. */ + memset(buffer, 0, sizeof(buffer)); + guest_ins_port80(buffer, 8192); + for (i = 0; i < 8192; i++) + __GUEST_ASSERT(buffer[i] == 0xaa, + "Expected '0xaa', got '0x%x' at buffer[%u]", + buffer[i], i); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_regs regs; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + memset(&regs, 0, sizeof(regs)); + + while (1) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (get_ucall(vcpu, &uc)) + break; + + TEST_ASSERT(run->io.port == 0x80, + "Expected I/O at port 0x80, got port 0x%x", run->io.port); + + /* + * Modify the rep string count in RCX: 2 => 1 and 3 => 8192. + * Note, this abuses KVM's batching of rep string I/O to avoid + * getting stuck in an infinite loop. That behavior isn't in + * scope from a testing perspective as it's not ABI in any way, + * i.e. it really is abusing internal KVM knowledge. 
*/ + vcpu_regs_get(vcpu, &regs); + if (regs.rcx == 2) + regs.rcx = 1; + if (regs.rcx == 3) + regs.rcx = 8192; + memset((void *)run + run->io.data_offset, 0xaa, 4096); + vcpu_regs_set(vcpu, &regs); + } + + switch (uc.cmd) { + case UCALL_DONE: + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c new file mode 100644 index 000000000000..32b2794b78fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c @@ -0,0 +1,769 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + * + * Tests for exiting into userspace on registered MSRs + */ +#include <sys/ioctl.h> + +#include "kvm_test_harness.h" +#include "test_util.h" +#include "kvm_util.h" +#include "vmx.h" + +#define MSR_NON_EXISTENT 0x474f4f00 + +static u64 deny_bits = 0; +struct kvm_msr_filter filter_allow = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test an MSR the kernel knows about. */ + .base = MSR_IA32_XSS, + .bitmap = (uint8_t*)&deny_bits, + }, { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test an MSR the kernel doesn't know about. */ + .base = MSR_IA32_FLUSH_CMD, + .bitmap = (uint8_t*)&deny_bits, + }, { + .flags = KVM_MSR_FILTER_READ | + KVM_MSR_FILTER_WRITE, + .nmsrs = 1, + /* Test a fabricated MSR that no one knows about. */ + .base = MSR_NON_EXISTENT, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +struct kvm_msr_filter filter_fs = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = MSR_FS_BASE, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +struct kvm_msr_filter filter_gs = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = MSR_GS_BASE, + .bitmap = (uint8_t*)&deny_bits, + }, + }, +}; + +static uint64_t msr_non_existent_data; +static int guest_exception_count; +static u32 msr_reads, msr_writes; + +static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_deadbeef[1] = { 0x1 }; + +static void deny_msr(uint8_t *bitmap, u32 msr) +{ + u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1); + + bitmap[idx / 8] &= ~(1 << (idx % 8)); +} + +static void prepare_bitmaps(void) +{ + memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000)); + memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write)); + memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000)); + memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000)); + memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read)); + + deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL); + deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK); + deny_msr(bitmap_c0000000_read, MSR_GS_BASE); +} + +struct kvm_msr_filter filter_deny = { + .flags = KVM_MSR_FILTER_DEFAULT_DENY, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0x00000000, + 
.nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000_write, + }, { + .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE, + .base = 0x40000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_40000000, + }, { + .flags = KVM_MSR_FILTER_READ, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000_read, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000, + }, { + .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ, + .base = 0xdeadbeef, + .nmsrs = 1, + .bitmap = bitmap_deadbeef, + }, + }, +}; + +struct kvm_msr_filter no_filter_deny = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, +}; + +/* + * Note: Force test_rdmsr() to not be inlined to prevent the labels, + * rdmsr_start and rdmsr_end, from being defined multiple times. + */ +static noinline uint64_t test_rdmsr(uint32_t msr) +{ + uint32_t a, d; + + guest_exception_count = 0; + + __asm__ __volatile__("rdmsr_start: rdmsr; rdmsr_end:" : + "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +/* + * Note: Force test_wrmsr() to not be inlined to prevent the labels, + * wrmsr_start and wrmsr_end, from being defined multiple times. + */ +static noinline void test_wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + guest_exception_count = 0; + + __asm__ __volatile__("wrmsr_start: wrmsr; wrmsr_end:" :: + "a"(a), "d"(d), "c"(msr) : "memory"); +} + +extern char rdmsr_start, rdmsr_end; +extern char wrmsr_start, wrmsr_end; + +/* + * Note: Force test_em_rdmsr() to not be inlined to prevent the labels, + * rdmsr_start and rdmsr_end, from being defined multiple times. + */ +static noinline uint64_t test_em_rdmsr(uint32_t msr) +{ + uint32_t a, d; + + guest_exception_count = 0; + + __asm__ __volatile__(KVM_FEP "em_rdmsr_start: rdmsr; em_rdmsr_end:" : + "=a"(a), "=d"(d) : "c"(msr) : "memory"); + + return a | ((uint64_t) d << 32); +} + +/* + * Note: Force test_em_wrmsr() to not be inlined to prevent the labels, + * wrmsr_start and wrmsr_end, from being defined multiple times. + */ +static noinline void test_em_wrmsr(uint32_t msr, uint64_t value) +{ + uint32_t a = value; + uint32_t d = value >> 32; + + guest_exception_count = 0; + + __asm__ __volatile__(KVM_FEP "em_wrmsr_start: wrmsr; em_wrmsr_end:" :: + "a"(a), "d"(d), "c"(msr) : "memory"); +} + +extern char em_rdmsr_start, em_rdmsr_end; +extern char em_wrmsr_start, em_wrmsr_end; + +static void guest_code_filter_allow(void) +{ + uint64_t data; + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_XSS. + * + * A GP is thrown if anything other than 0 is written to + * MSR_IA32_XSS. + */ + data = test_rdmsr(MSR_IA32_XSS); + GUEST_ASSERT(data == 0); + GUEST_ASSERT(guest_exception_count == 0); + + test_wrmsr(MSR_IA32_XSS, 0); + GUEST_ASSERT(guest_exception_count == 0); + + test_wrmsr(MSR_IA32_XSS, 1); + GUEST_ASSERT(guest_exception_count == 1); + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_FLUSH_CMD. + * + * A GP is thrown if MSR_IA32_FLUSH_CMD is read + * from or if a value other than 1 is written to it. 
+ */ + test_rdmsr(MSR_IA32_FLUSH_CMD); + GUEST_ASSERT(guest_exception_count == 1); + + test_wrmsr(MSR_IA32_FLUSH_CMD, 0); + GUEST_ASSERT(guest_exception_count == 1); + + test_wrmsr(MSR_IA32_FLUSH_CMD, 1); + GUEST_ASSERT(guest_exception_count == 0); + + /* + * Test userspace intercepting rdmsr / wrmsr for MSR_NON_EXISTENT. + * + * Test that a fabricated MSR can pass through the kernel + * and be handled in userspace. + */ + test_wrmsr(MSR_NON_EXISTENT, 2); + GUEST_ASSERT(guest_exception_count == 0); + + data = test_rdmsr(MSR_NON_EXISTENT); + GUEST_ASSERT(data == 2); + GUEST_ASSERT(guest_exception_count == 0); + + if (is_forced_emulation_enabled) { + /* Let userspace know we aren't done. */ + GUEST_SYNC(0); + + /* + * Now run the same tests with the instruction emulator. + */ + data = test_em_rdmsr(MSR_IA32_XSS); + GUEST_ASSERT(data == 0); + GUEST_ASSERT(guest_exception_count == 0); + test_em_wrmsr(MSR_IA32_XSS, 0); + GUEST_ASSERT(guest_exception_count == 0); + test_em_wrmsr(MSR_IA32_XSS, 1); + GUEST_ASSERT(guest_exception_count == 1); + + test_em_rdmsr(MSR_IA32_FLUSH_CMD); + GUEST_ASSERT(guest_exception_count == 1); + test_em_wrmsr(MSR_IA32_FLUSH_CMD, 0); + GUEST_ASSERT(guest_exception_count == 1); + test_em_wrmsr(MSR_IA32_FLUSH_CMD, 1); + GUEST_ASSERT(guest_exception_count == 0); + + test_em_wrmsr(MSR_NON_EXISTENT, 2); + GUEST_ASSERT(guest_exception_count == 0); + data = test_em_rdmsr(MSR_NON_EXISTENT); + GUEST_ASSERT(data == 2); + GUEST_ASSERT(guest_exception_count == 0); + } + + GUEST_DONE(); +} + +static void guest_msr_calls(bool trapped) +{ + /* This goes into the in-kernel emulation */ + wrmsr(MSR_SYSCALL_MASK, 0); + + if (trapped) { + /* This goes into user space emulation */ + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE); + } else { + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE); + } + + /* If trapped == true, this goes into user space emulation */ + wrmsr(MSR_IA32_POWER_CTL, 0x1234); + + /* This goes into the in-kernel emulation */ + rdmsr(MSR_IA32_POWER_CTL); + + /* Invalid MSR, should always be handled by user space exit */ + GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef); + wrmsr(0xdeadbeef, 0x1234); +} + +static void guest_code_filter_deny(void) +{ + guest_msr_calls(true); + + /* + * Disable msr filtering, so that the kernel + * handles everything in the next round + */ + GUEST_SYNC(0); + + guest_msr_calls(false); + + GUEST_DONE(); +} + +static void guest_code_permission_bitmap(void) +{ + uint64_t data; + + data = test_rdmsr(MSR_FS_BASE); + GUEST_ASSERT(data == MSR_FS_BASE); + data = test_rdmsr(MSR_GS_BASE); + GUEST_ASSERT(data != MSR_GS_BASE); + + /* Let userspace know to switch the filter */ + GUEST_SYNC(0); + + data = test_rdmsr(MSR_FS_BASE); + GUEST_ASSERT(data != MSR_FS_BASE); + data = test_rdmsr(MSR_GS_BASE); + GUEST_ASSERT(data == MSR_GS_BASE); + + GUEST_DONE(); +} + +static void __guest_gp_handler(struct ex_regs *regs, + char *r_start, char *r_end, + char *w_start, char *w_end) +{ + if (regs->rip == (uintptr_t)r_start) { + regs->rip = (uintptr_t)r_end; + regs->rax = 0; + regs->rdx = 0; + } else if (regs->rip == (uintptr_t)w_start) { + regs->rip = (uintptr_t)w_end; + } else { + GUEST_ASSERT(!"RIP is at an unknown location!"); + } + + ++guest_exception_count; +} + +static void guest_gp_handler(struct ex_regs *regs) +{ + __guest_gp_handler(regs, &rdmsr_start, &rdmsr_end, + &wrmsr_start, &wrmsr_end); +} + +static void guest_fep_gp_handler(struct 
ex_regs *regs) +{ + __guest_gp_handler(regs, &em_rdmsr_start, &em_rdmsr_end, + &em_wrmsr_start, &em_wrmsr_end); +} + +static void check_for_guest_assert(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (vcpu->run->exit_reason == KVM_EXIT_IO && + get_ucall(vcpu, &uc) == UCALL_ABORT) { + REPORT_GUEST_ASSERT(uc); + } +} + +static void process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) +{ + struct kvm_run *run = vcpu->run; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_RDMSR); + TEST_ASSERT(run->msr.index == msr_index, + "Unexpected msr (0x%04x), expected 0x%04x", + run->msr.index, msr_index); + + switch (run->msr.index) { + case MSR_IA32_XSS: + run->msr.data = 0; + break; + case MSR_IA32_FLUSH_CMD: + run->msr.error = 1; + break; + case MSR_NON_EXISTENT: + run->msr.data = msr_non_existent_data; + break; + case MSR_FS_BASE: + run->msr.data = MSR_FS_BASE; + break; + case MSR_GS_BASE: + run->msr.data = MSR_GS_BASE; + break; + default: + TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); + } +} + +static void process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index) +{ + struct kvm_run *run = vcpu->run; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_WRMSR); + TEST_ASSERT(run->msr.index == msr_index, + "Unexpected msr (0x%04x), expected 0x%04x", + run->msr.index, msr_index); + + switch (run->msr.index) { + case MSR_IA32_XSS: + if (run->msr.data != 0) + run->msr.error = 1; + break; + case MSR_IA32_FLUSH_CMD: + if (run->msr.data != 1) + run->msr.error = 1; + break; + case MSR_NON_EXISTENT: + msr_non_existent_data = run->msr.data; + break; + default: + TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index); + } +} + +static void process_ucall_done(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE, + "Unexpected ucall command: %lu, expected UCALL_DONE (%d)", + uc.cmd, UCALL_DONE); +} + +static uint64_t process_ucall(struct kvm_vcpu *vcpu) +{ + struct ucall uc = {}; + + check_for_guest_assert(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + break; + case UCALL_ABORT: + check_for_guest_assert(vcpu); + break; + case UCALL_DONE: + process_ucall_done(vcpu); + break; + default: + TEST_ASSERT(false, "Unexpected ucall"); + } + + return uc.cmd; +} + +static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) +{ + vcpu_run(vcpu); + process_rdmsr(vcpu, msr_index); +} + +static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu, + uint32_t msr_index) +{ + vcpu_run(vcpu); + process_wrmsr(vcpu, msr_index); +} + +static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + return process_ucall(vcpu); +} + +static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu) +{ + vcpu_run(vcpu); + process_ucall_done(vcpu); +} + +KVM_ONE_VCPU_TEST_SUITE(user_msr); + +KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow) +{ + struct kvm_vm *vm = vcpu->vm; + uint64_t cmd; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow); + + 
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + /* Process guest code userspace exits. */ + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); + + vcpu_run(vcpu); + cmd = process_ucall(vcpu); + + if (is_forced_emulation_enabled) { + TEST_ASSERT_EQ(cmd, UCALL_SYNC); + vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler); + + /* Process emulated rdmsr and wrmsr instructions. */ + run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS); + + run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD); + + run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT); + run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT); + + /* Confirm the guest completed without issues. */ + run_guest_then_process_ucall_done(vcpu); + } else { + TEST_ASSERT_EQ(cmd, UCALL_DONE); + printf("To run the instruction emulated tests set the module parameter 'kvm.force_emulation_prefix=1'\n"); + } +} + +static int handle_ucall(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_SYNC: + vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny); + break; + case UCALL_DONE: + return 1; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + return 0; +} + +static void handle_rdmsr(struct kvm_run *run) +{ + run->msr.data = run->msr.index; + msr_reads++; + + if (run->msr.index == MSR_SYSCALL_MASK || + run->msr.index == MSR_GS_BASE) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR read trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "MSR deadbeef read trap w/o inval fault"); + } +} + +static void handle_wrmsr(struct kvm_run *run) +{ + /* ignore */ + msr_writes++; + + if (run->msr.index == MSR_IA32_POWER_CTL) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for MSR_IA32_POWER_CTL incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR_IA32_POWER_CTL trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for deadbeef incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "deadbeef trap w/o inval fault"); + } +} + +KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny) +{ + struct kvm_vm *vm = vcpu->vm; + struct kvm_run *run = vcpu->run; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + prepare_bitmaps(); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny); + + while (1) { + vcpu_run(vcpu); + + switch (run->exit_reason) { + case KVM_EXIT_X86_RDMSR: + 
handle_rdmsr(run); + break; + case KVM_EXIT_X86_WRMSR: + handle_wrmsr(run); + break; + case KVM_EXIT_IO: + if (handle_ucall(vcpu)) + goto done; + break; + } + + } + +done: + TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space"); + TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space"); +} + +KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap) +{ + struct kvm_vm *vm = vcpu->vm; + int rc; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_fs); + run_guest_then_process_rdmsr(vcpu, MSR_FS_BASE); + TEST_ASSERT(run_guest_then_process_ucall(vcpu) == UCALL_SYNC, + "Expected ucall state to be UCALL_SYNC."); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs); + run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE); + run_guest_then_process_ucall_done(vcpu); +} + +#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \ +({ \ + int r = __vm_ioctl(vm, cmd, arg); \ + \ + if (flag & valid_mask) \ + TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \ + else \ + TEST_ASSERT(r == -1 && errno == EINVAL, \ + "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \ + #cmd, flag, r, errno, strerror(errno)); \ +}) + +static void run_user_space_msr_flag_test(struct kvm_vm *vm) +{ + struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR }; + int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + + for (i = 0; i < nflags; i++) { + cap.args[0] = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap, + BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK); + } +} + +static void run_msr_filter_flag_test(struct kvm_vm *vm) +{ + u64 deny_bits = 0; + struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .nmsrs = 1, + .base = 0, + .bitmap = (uint8_t *)&deny_bits, + }, + }, + }; + int nflags; + int rc; + int i; + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + nflags = sizeof(filter.flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK); + } + + filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW; + nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE; + for (i = 0; i < nflags; i++) { + filter.ranges[0].flags = BIT_ULL(i); + test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter, + BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK); + } +} + +/* Test that attempts to write to the unused bits in a flag fails. */ +KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL) +{ + struct kvm_vm *vm = vcpu->vm; + + /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */ + run_user_space_msr_flag_test(vm); + + /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. 
*/ + run_msr_filter_flag_test(vm); +} + +int main(int argc, char *argv[]) +{ + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c new file mode 100644 index 000000000000..a81a24761aac --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_apic_access_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * The first subtest simply checks to see that an L2 guest can be + * launched with a valid APIC-access address that is backed by a + * page of L1 physical memory. + * + * The second subtest sets the APIC-access address to a (valid) L1 + * physical address that is not backed by memory. KVM can't handle + * this situation, so resuming L2 should result in a KVM exit for + * internal error (emulation). This is not an architectural + * requirement. It is just a shortcoming of KVM. The internal error + * is unfortunate, but it's better than what used to happen! + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include <string.h> +#include <sys/ioctl.h> + +#include "kselftest.h" + +static void l2_guest_code(void) +{ + /* Exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa); + + /* Try to launch L2 with the memory-backed APIC-access address. */ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + vmwrite(APIC_ACCESS_ADDR, high_gpa); + + /* Try to resume L2 with the unbacked APIC-access address. 
*/ + GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR)); + GUEST_ASSERT(!vmresume()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned long apic_access_addr = ~0ul; + vm_vaddr_t vmx_pages_gva; + unsigned long high_gpa; + struct vmx_pages *vmx; + bool done = false; + + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + high_gpa = (vm->max_gfn - 1) << vm->page_shift; + + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtualize_apic_accesses(vmx, vm); + vcpu_args_set(vcpu, 2, vmx_pages_gva, high_gpa); + + while (!done) { + volatile struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + if (apic_access_addr == high_gpa) { + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT(run->internal.suberror == + KVM_INTERNAL_ERROR_EMULATION, + "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u", + run->internal.suberror); + break; + } + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + apic_access_addr = uc.args[1]; + break; + case UCALL_DONE: + done = true; + break; + default: + TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd); + } + } + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index fe40ade06a49..dad988351493 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -18,20 +18,15 @@ #include "kselftest.h" -#define VCPU_ID 5 - enum { PORT_L0_EXIT = 0x2000, }; -/* The virtual machine object. */ -static struct kvm_vm *vm; - static void l2_guest_code(void) { /* Exit to L0 */ - asm volatile("inb %%dx, %%al" - : : [port] "d" (PORT_L0_EXIT) : "rax"); + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); } static void l1_guest_code(struct vmx_pages *vmx_pages) @@ -53,32 +48,30 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); /* Allocate VMX pages and shared descriptors (vmx_pages). 
*/ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + volatile struct kvm_run *run = vcpu->run; struct ucall uc; - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); if (run->io.port == PORT_L0_EXIT) break; - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ default: TEST_FAIL("Unknown ucall %lu", uc.cmd); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c index e894a638a155..fa512d033205 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c @@ -4,9 +4,6 @@ * * Copyright (C) 2018, Red Hat, Inc. */ - -#define _GNU_SOURCE /* for program_invocation_name */ - #include <stdio.h> #include <stdlib.h> #include <linux/bitmap.h> @@ -17,8 +14,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 1 - /* The memory slot index to track dirty pages */ #define TEST_MEM_SLOT_INDEX 1 #define TEST_MEM_PAGES 3 @@ -30,16 +25,16 @@ #define NESTED_TEST_MEM1 0xc0001000 #define NESTED_TEST_MEM2 0xc0002000 -static void l2_guest_code(void) +static void l2_guest_code(u64 *a, u64 *b) { - *(volatile uint64_t *)NESTED_TEST_MEM1; - *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; + READ_ONCE(*a); + WRITE_ONCE(*a, 1); GUEST_SYNC(true); GUEST_SYNC(false); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); - *(volatile uint64_t *)NESTED_TEST_MEM2 = 1; + WRITE_ONCE(*b, 1); GUEST_SYNC(true); GUEST_SYNC(false); @@ -47,17 +42,33 @@ static void l2_guest_code(void) vmcall(); } +static void l2_guest_code_ept_enabled(void) +{ + l2_guest_code((u64 *)NESTED_TEST_MEM1, (u64 *)NESTED_TEST_MEM2); +} + +static void l2_guest_code_ept_disabled(void) +{ + /* Access the same L1 GPAs as l2_guest_code_ept_enabled() */ + l2_guest_code((u64 *)GUEST_TEST_MEM, (u64 *)GUEST_TEST_MEM); +} + void l1_guest_code(struct vmx_pages *vmx) { #define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + void *l2_rip; GUEST_ASSERT(vmx->vmcs_gpa); GUEST_ASSERT(prepare_for_vmx_operation(vmx)); GUEST_ASSERT(load_vmcs(vmx)); - prepare_vmcs(vmx, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); + if (vmx->eptp_gpa) + l2_rip = l2_guest_code_ept_enabled; + else + l2_rip = l2_guest_code_ept_disabled; + + prepare_vmcs(vmx, l2_rip, &l2_guest_stack[L2_GUEST_STACK_SIZE]); GUEST_SYNC(false); GUEST_ASSERT(!vmlaunch()); @@ -66,26 +77,24 @@ void l1_guest_code(struct vmx_pages *vmx) GUEST_DONE(); } -int main(int argc, char *argv[]) +static void test_vmx_dirty_log(bool enable_ept) { vm_vaddr_t vmx_pages_gva = 0; struct vmx_pages *vmx; unsigned long *bmap; uint64_t *host_test_mem; + struct kvm_vcpu *vcpu; struct kvm_vm *vm; - struct kvm_run *run; struct ucall uc; bool done = false; - nested_vmx_check_supported(); + pr_info("Nested EPT: %s\n", enable_ept ? 
"enabled" : "disabled"); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, l1_guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - run = vcpu_state(vm, VCPU_ID); + vcpu_args_set(vcpu, 1, vmx_pages_gva); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -98,7 +107,7 @@ int main(int argc, char *argv[]) * Add an identity map for GVA range [0xc0000000, 0xc0002000). This * affects both L1 and L2. However... */ - virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0); + virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES); /* * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to @@ -106,27 +115,28 @@ int main(int argc, char *argv[]) * * Note that prepare_eptp should be called only L1's GPA map is done, * meaning after the last call to virt_map. + * + * When EPT is disabled, the L2 guest code will still access the same L1 + * GPAs as the EPT enabled case. */ - prepare_eptp(vmx, vm, 0); - nested_map_memslot(vmx, vm, 0, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0); + if (enable_ept) { + prepare_eptp(vmx, vm, 0); + nested_map_memslot(vmx, vm, 0); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + } - bmap = bitmap_alloc(TEST_MEM_PAGES); + bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Unexpected exit reason: %u (%s),\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: /* @@ -135,17 +145,17 @@ int main(int argc, char *argv[]) */ kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); if (uc.args[1]) { - TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n"); - TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n"); + TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean"); + TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest"); } else { - TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n"); + TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest"); } - TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n"); - TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n"); + TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); + TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); + 
TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; @@ -155,3 +165,15 @@ int main(int argc, char *argv[]) } } } + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + test_vmx_dirty_log(/*enable_ept=*/false); + + if (kvm_cpu_has_ept()) + test_vmx_dirty_log(/*enable_ept=*/true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c new file mode 100644 index 000000000000..3fd6eceab46f --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include <signal.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/time.h> + +#include "kselftest.h" + +static void guest_ud_handler(struct ex_regs *regs) +{ + /* Loop on the ud2 until guest state is made invalid. */ +} + +static void guest_code(void) +{ + asm volatile("ud2"); +} + +static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Expected emulation failure, got %d", + run->emulation_failure.suberror); +} + +static void run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu) +{ + /* + * Always run twice to verify KVM handles the case where _KVM_ queues + * an exception with invalid state and then exits to userspace, i.e. + * that KVM doesn't explode if userspace ignores the initial error. + */ + __run_vcpu_with_invalid_state(vcpu); + __run_vcpu_with_invalid_state(vcpu); +} + +static void set_timer(void) +{ + struct itimerval timer; + + timer.it_value.tv_sec = 0; + timer.it_value.tv_usec = 200; + timer.it_interval = timer.it_value; + TEST_ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); +} + +static void set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set) +{ + static struct kvm_sregs sregs; + + if (!sregs.cr0) + vcpu_sregs_get(vcpu, &sregs); + sregs.tr.unusable = !!set; + vcpu_sregs_set(vcpu, &sregs); +} + +static void set_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + set_or_clear_invalid_guest_state(vcpu, true); +} + +static void clear_invalid_guest_state(struct kvm_vcpu *vcpu) +{ + set_or_clear_invalid_guest_state(vcpu, false); +} + +static struct kvm_vcpu *get_set_sigalrm_vcpu(struct kvm_vcpu *__vcpu) +{ + static struct kvm_vcpu *vcpu = NULL; + + if (__vcpu) + vcpu = __vcpu; + return vcpu; +} + +static void sigalrm_handler(int sig) +{ + struct kvm_vcpu *vcpu = get_set_sigalrm_vcpu(NULL); + struct kvm_vcpu_events events; + + TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); + + vcpu_events_get(vcpu, &events); + + /* + * If an exception is pending, attempt KVM_RUN with invalid guest, + * otherwise rearm the timer and keep doing so until the timer fires + * between KVM queueing an exception and re-entering the guest. 
+ */
+ if (events.exception.pending) {
+ set_invalid_guest_state(vcpu);
+ run_vcpu_with_invalid_state(vcpu);
+ } else {
+ set_timer();
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(host_cpu_is_intel);
+ TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ get_set_sigalrm_vcpu(vcpu);
+
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+ /*
+ * Stuff invalid guest state for L2 by making TR unusable. The next
+ * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
+ * emulating invalid guest state for L2.
+ */
+ set_invalid_guest_state(vcpu);
+ run_vcpu_with_invalid_state(vcpu);
+
+ /*
+ * Verify KVM also handles the case where userspace gains control while
+ * an exception is pending and stuffs invalid state. Run with valid
+ * guest state and a timer firing every 200us, and attempt to enter the
+ * guest with invalid state when the handler interrupts KVM with an
+ * exception pending.
+ */
+ clear_invalid_guest_state(vcpu);
+ TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
+ "Failed to register SIGALRM handler, errno = %d (%s)",
+ errno, strerror(errno));
+
+ set_timer();
+ run_vcpu_with_invalid_state(vcpu);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
new file mode 100644
index 000000000000..a100ee5f0009
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define ARBITRARY_IO_PORT 0x2000
+
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ /*
+ * Generate an exit to L0 userspace, i.e. main(), via I/O to an
+ * arbitrary port.
+ */
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (ARBITRARY_IO_PORT) : "rax");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /*
+ * L2 must be run without unrestricted guest, verify that the selftests
+ * library hasn't enabled it. Because KVM selftests jump directly to
+ * 64-bit mode, unrestricted guest support isn't required.
+ */
+ GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) ||
+ !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST));
+
+ GUEST_ASSERT(!vmlaunch());
+
+ /* L2 should triple fault after main() stuffs invalid guest state. */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ /* Allocate VMX pages and shared descriptors (vmx_pages). */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+ vcpu_run(vcpu);
+
+ run = vcpu->run;
+
+ /*
+ * The first exit to L0 userspace should be an I/O access from L2.
+ * Running L1 should launch L2 without triggering an exit to userspace.
+ */
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT,
+ "Expected IN from port %d from L2, got port %d",
+ ARBITRARY_IO_PORT, run->io.port);
+
+ /*
+ * Stuff invalid guest state for L2 by making TR unusable. The next
+ * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
+ * emulating invalid guest state for L2.
+ */
+ memset(&sregs, 0, sizeof(sregs));
+ vcpu_sregs_get(vcpu, &sregs);
+ sregs.tr.unusable = 1;
+ vcpu_sregs_set(vcpu, &sregs);
+
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c
new file mode 100644
index 000000000000..90720b6205f4
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX control MSR test
+ *
+ * Copyright (C) 2022 Google LLC.
+ *
+ * Tests for KVM ownership of bits in the VMX entry/exit control MSRs. Checks
+ * that KVM will set owned bits where appropriate, and will not if
+ * KVM_X86_QUIRK_TWEAK_VMX_CTRL_MSRS is disabled.
+ */
+#include <linux/bitmap.h>
+#include "kvm_util.h"
+#include "vmx.h"
+
+static void vmx_fixed1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index,
+ uint64_t mask)
+{
+ uint64_t val = vcpu_get_msr(vcpu, msr_index);
+ uint64_t bit;
+
+ mask &= val;
+
+ for_each_set_bit(bit, &mask, 64) {
+ vcpu_set_msr(vcpu, msr_index, val & ~BIT_ULL(bit));
+ vcpu_set_msr(vcpu, msr_index, val);
+ }
+}
+
+static void vmx_fixed0_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index,
+ uint64_t mask)
+{
+ uint64_t val = vcpu_get_msr(vcpu, msr_index);
+ uint64_t bit;
+
+ mask = ~mask | val;
+
+ for_each_clear_bit(bit, &mask, 64) {
+ vcpu_set_msr(vcpu, msr_index, val | BIT_ULL(bit));
+ vcpu_set_msr(vcpu, msr_index, val);
+ }
+}
+
+static void vmx_fixed0and1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index)
+{
+ vmx_fixed0_msr_test(vcpu, msr_index, GENMASK_ULL(31, 0));
+ vmx_fixed1_msr_test(vcpu, msr_index, GENMASK_ULL(63, 32));
+}
+
+static void vmx_save_restore_msrs_test(struct kvm_vcpu *vcpu)
+{
+ vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, 0);
+ vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, -1ull);
+
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_BASIC,
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55));
+
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_MISC,
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) |
+ BIT_ULL(15) | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30));
+
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2);
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, -1ull);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS);
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_VMFUNC, -1ull);
+}
+
+static void __ia32_feature_control_msr_test(struct kvm_vcpu *vcpu,
+ uint64_t msr_bit,
+ struct kvm_x86_cpu_feature feature)
+{
+ uint64_t val;
+
+ vcpu_clear_cpuid_feature(vcpu, feature);
+
+ val = vcpu_get_msr(vcpu, MSR_IA32_FEAT_CTL);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, 
MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED); + vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val); + + if (!kvm_cpu_has(feature)) + return; + + vcpu_set_cpuid_feature(vcpu, feature); +} + +static void ia32_feature_control_msr_test(struct kvm_vcpu *vcpu) +{ + uint64_t supported_bits = FEAT_CTL_LOCKED | + FEAT_CTL_VMX_ENABLED_INSIDE_SMX | + FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | + FEAT_CTL_SGX_LC_ENABLED | + FEAT_CTL_SGX_ENABLED | + FEAT_CTL_LMCE_ENABLED; + int bit, r; + + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_SMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX, X86_FEATURE_VMX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX_LC); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_ENABLED, X86_FEATURE_SGX); + __ia32_feature_control_msr_test(vcpu, FEAT_CTL_LMCE_ENABLED, X86_FEATURE_MCE); + + for_each_clear_bit(bit, &supported_bits, 64) { + r = _vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, BIT(bit)); + TEST_ASSERT(r == 0, + "Setting reserved bit %d in IA32_FEATURE_CONTROL should fail", bit); + } +} + +int main(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + /* No need to actually do KVM_RUN, thus no guest code. */ + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + vmx_save_restore_msrs_test(vcpu); + ia32_feature_control_msr_test(vcpu); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c new file mode 100644 index 000000000000..1759fa5cb3f2 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_nested_tsc_scaling_test + * + * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * This test case verifies that nested TSC scaling behaves as expected when + * both L1 and L2 are scaled using different ratios. For this test we scale + * L1 down and scale L2 up. + */ + +#include <time.h> + +#include "kvm_util.h" +#include "vmx.h" +#include "kselftest.h" + +/* L2 is scaled up (from L1's perspective) by this factor */ +#define L2_SCALE_FACTOR 4ULL + +#define TSC_OFFSET_L2 ((uint64_t) -33125236320908) +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) + +#define L2_GUEST_STACK_SIZE 64 + +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) + + +/* + * This function checks whether the "actual" TSC frequency of a guest matches + * its expected frequency. In order to account for delays in taking the TSC + * measurements, a difference of 1% between the actual and the expected value + * is tolerated. 
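+ *
+ * As a rough worked example (illustrative numbers, not taken from the test):
+ * with an expected frequency of 3,000,000,000 counts/sec the tolerance is
+ * 30,000,000, so only measured values strictly between 2,970,000,000 and
+ * 3,030,000,000 satisfy the assertions below.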
+ */ +static void compare_tsc_freq(uint64_t actual, uint64_t expected) +{ + uint64_t tolerance, thresh_low, thresh_high; + + tolerance = expected / 100; + thresh_low = expected - tolerance; + thresh_high = expected + tolerance; + + TEST_ASSERT(thresh_low < actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); + TEST_ASSERT(thresh_high > actual, + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 + " but it actually is %"PRIu64, + thresh_low, thresh_high, actual); +} + +static void check_tsc_freq(int level) +{ + uint64_t tsc_start, tsc_end, tsc_freq; + + /* + * Reading the TSC twice with about a second's difference should give + * us an approximation of the TSC frequency from the guest's + * perspective. Now, this won't be completely accurate, but it should + * be good enough for the purposes of this test. + */ + tsc_start = rdmsr(MSR_IA32_TSC); + GUEST_SLEEP(1); + tsc_end = rdmsr(MSR_IA32_TSC); + + tsc_freq = tsc_end - tsc_start; + + GUEST_CHECK(level, tsc_freq); +} + +static void l2_guest_code(void) +{ + check_tsc_freq(UCHECK_L2); + + /* exit to L1 */ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* prepare the VMCS for L2 execution */ + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC offsetting and TSC scaling for L2 */ + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= SECONDARY_EXEC_TSC_SCALING; + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); + + /* launch L2 */ + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t vmx_pages_gva; + + uint64_t tsc_start, tsc_end; + uint64_t tsc_khz; + uint64_t l1_scale_factor; + uint64_t l0_tsc_freq = 0; + uint64_t l1_tsc_freq = 0; + uint64_t l2_tsc_freq = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); + TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); + + /* + * We set L1's scale factor to be a random number from 2 to 10. + * Ideally we would do the same for L2's factor but that one is + * referenced by both main() and l1_guest_code() and using a global + * variable does not work. 
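+ *
+ * (rand() % 9) evaluates to a value in 0..8, so the "+ 2" below keeps the
+ * scale factor in the inclusive range [2, 10].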
+ */ + srand(time(NULL)); + l1_scale_factor = (rand() % 9) + 2; + printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor); + printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR); + + tsc_start = rdtsc(); + sleep(1); + tsc_end = rdtsc(); + + l0_tsc_freq = tsc_end - tsc_start; + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); + + /* scale down L1's TSC frequency */ + vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor)); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + switch (uc.args[0]) { + case USLEEP: + sleep(uc.args[1]); + break; + case UCHECK_L1: + l1_tsc_freq = uc.args[1]; + printf("L1's TSC frequency is around: %"PRIu64 + "\n", l1_tsc_freq); + + compare_tsc_freq(l1_tsc_freq, + l0_tsc_freq / l1_scale_factor); + break; + case UCHECK_L2: + l2_tsc_freq = uc.args[1]; + printf("L2's TSC frequency is around: %"PRIu64 + "\n", l2_tsc_freq); + + compare_tsc_freq(l2_tsc_freq, + l1_tsc_freq * L2_SCALE_FACTOR); + break; + } + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c new file mode 100644 index 000000000000..7c92536551cc --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for VMX-pmu perf capability msr + * + * Copyright (C) 2021 Intel Corporation + * + * Test to check the effect of various CPUID settings on + * MSR_IA32_PERF_CAPABILITIES MSR, and check that what + * we write with KVM_SET_MSR is _not_ modified by the guest + * and check it can be retrieved with KVM_GET_MSR, also test + * the invalid LBR formats are rejected. + */ +#include <sys/ioctl.h> + +#include <linux/bitmap.h> + +#include "kvm_test_harness.h" +#include "kvm_util.h" +#include "vmx.h" + +static union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + u64 full_width_write:1; + u64 pebs_baseline:1; + u64 perf_metrics:1; + u64 pebs_output_pt_available:1; + u64 anythread_deprecated:1; + }; + u64 capabilities; +} host_cap; + +/* + * The LBR format and most PEBS features are immutable, all other features are + * fungible (if supported by the host and KVM). 
+ */ +static const union perf_capabilities immutable_caps = { + .lbr_format = -1, + .pebs_trap = 1, + .pebs_arch_reg = 1, + .pebs_format = -1, + .pebs_baseline = 1, +}; + +static const union perf_capabilities format_caps = { + .lbr_format = -1, + .pebs_format = -1, +}; + +static void guest_test_perf_capabilities_gp(uint64_t val) +{ + uint8_t vector = wrmsr_safe(MSR_IA32_PERF_CAPABILITIES, val); + + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP for value '0x%lx', got vector '0x%x'", + val, vector); +} + +static void guest_code(uint64_t current_val) +{ + int i; + + guest_test_perf_capabilities_gp(current_val); + guest_test_perf_capabilities_gp(0); + + for (i = 0; i < 64; i++) + guest_test_perf_capabilities_gp(current_val ^ BIT_ULL(i)); + + GUEST_DONE(); +} + +KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps); + +/* + * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value + * written, that the guest always sees the userspace controlled value, and that + * PERF_CAPABILITIES is immutable after KVM_RUN. + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code) +{ + struct ucall uc; + int r, i; + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + + vcpu_args_set(vcpu, 1, host_cap.capabilities); + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + break; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + + TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES), + host_cap.capabilities); + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + TEST_ASSERT(!r, "Post-KVM_RUN write '0' didn't fail"); + + for (i = 0; i < 64; i++) { + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities ^ BIT_ULL(i)); + TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx'didn't fail", + host_cap.capabilities ^ BIT_ULL(i)); + } +} + +/* + * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features + * enabled, as well as '0' (to disable all features). + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code) +{ + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); +} + +KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code) +{ + const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities; + int bit; + + for_each_set_bit(bit, &fungible_caps, 64) { + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(bit)); + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities & ~BIT_ULL(bit)); + } + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); +} + +/* + * Verify KVM rejects attempts to set unsupported and/or immutable features in + * PERF_CAPABILITIES. Note, LBR format and PEBS format need to be validated + * separately as they are multi-bit values, e.g. toggling or setting a single + * bit can generate a false positive without dedicated safeguards. 
+ */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code) +{ + const uint64_t reserved_caps = (~host_cap.capabilities | + immutable_caps.capabilities) & + ~format_caps.capabilities; + union perf_capabilities val = host_cap; + int r, bit; + + for_each_set_bit(bit, &reserved_caps, 64) { + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, + host_cap.capabilities ^ BIT_ULL(bit)); + TEST_ASSERT(!r, "%s immutable feature 0x%llx (bit %d) didn't fail", + host_cap.capabilities & BIT_ULL(bit) ? "Setting" : "Clearing", + BIT_ULL(bit), bit); + } + + /* + * KVM only supports the host's native LBR format, as well as '0' (to + * disable LBR support). Verify KVM rejects all other LBR formats. + */ + for (val.lbr_format = 1; val.lbr_format; val.lbr_format++) { + if (val.lbr_format == host_cap.lbr_format) + continue; + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); + TEST_ASSERT(!r, "Bad LBR FMT = 0x%x didn't fail, host = 0x%x", + val.lbr_format, host_cap.lbr_format); + } + + /* Ditto for the PEBS format. */ + for (val.pebs_format = 1; val.pebs_format; val.pebs_format++) { + if (val.pebs_format == host_cap.pebs_format) + continue; + + r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities); + TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x", + val.pebs_format, host_cap.pebs_format); + } +} + +/* + * Test that LBR MSRs are writable when LBRs are enabled, and then verify that + * disabling the vPMU via CPUID also disables LBR support. Set bits 2:0 of + * LBR_TOS as those bits are writable across all uarch implementations (arch + * LBRs will need to poke a different MSR). + */ +KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code) +{ + int r; + + if (!host_cap.lbr_format) + return; + + vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities); + vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); + + vcpu_clear_cpuid_entry(vcpu, X86_PROPERTY_PMU_VERSION.function); + + r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7); + TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU"); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_is_pmu_enabled()); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM)); + + TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION)); + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0); + + host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES); + + TEST_ASSERT(host_cap.full_width_write, + "Full-width writes should always be supported"); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c index a7737af1224f..00dd2ac07a61 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c @@ -9,7 +9,6 @@ * value instead of partially decayed timer value * */ -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -22,7 +21,6 @@ #include "processor.h" #include "vmx.h" -#define VCPU_ID 5 #define PREEMPTION_TIMER_VALUE 100000000ull #define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull @@ -158,7 +156,7 @@ int main(int argc, char *argv[]) struct kvm_regs regs1, regs2; struct kvm_vm *vm; - struct kvm_run *run; + struct kvm_vcpu *vcpu; struct kvm_x86_state *state; struct ucall uc; int stage; @@ -167,34 +165,25 @@ int main(int argc, char *argv[]) * AMD currently does not implement any VMX features, so for now we 
* just early out. */ - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - run = vcpu_state(vm, VCPU_ID); - - vcpu_regs_get(vm, VCPU_ID, ®s1); - - if (kvm_check_cap(KVM_CAP_NESTED_STATE)) { - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); - } else { - pr_info("will skip vmx preemption timer checks\n"); - goto done; - } + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vcpu_regs_get(vcpu, ®s1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (stage = 1;; stage++) { - _vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Stage %d: unexpected exit reason: %u (%s),\n", - stage, run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], - __FILE__, uc.args[1]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: break; @@ -233,22 +222,19 @@ int main(int argc, char *argv[]) stage, uc.args[4], uc.args[5]); } - state = vcpu_save_state(vm, VCPU_ID); + state = vcpu_save_state(vcpu); memset(®s1, 0, sizeof(regs1)); - vcpu_regs_get(vm, VCPU_ID, ®s1); + vcpu_regs_get(vcpu, ®s1); kvm_vm_release(vm); /* Restore state in a new VM. */ - kvm_vm_restart(vm, O_RDWR); - vm_vcpu_add(vm, VCPU_ID); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); - vcpu_load_state(vm, VCPU_ID, state); - run = vcpu_state(vm, VCPU_ID); - free(state); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); - vcpu_regs_get(vm, VCPU_ID, ®s2); + vcpu_regs_get(vcpu, ®s2); TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", (ulong) regs2.rdi, (ulong) regs2.rsi); diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c index d59f3eb67c8f..67a62a5a8895 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c @@ -23,38 +23,37 @@ * changes this should be updated. 
*/ #define VMCS12_REVISION 0x11e57ed0 -#define VCPU_ID 5 bool have_evmcs; -void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - vcpu_nested_state_set(vm, VCPU_ID, state, false); + vcpu_nested_state_set(vcpu, state); } -void test_nested_state_expect_errno(struct kvm_vm *vm, +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, struct kvm_nested_state *state, int expected_errno) { int rv; - rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); + rv = __vcpu_nested_state_set(vcpu, state); TEST_ASSERT(rv == -1 && errno == expected_errno, "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", strerror(expected_errno), expected_errno, rv, strerror(errno), errno); } -void test_nested_state_expect_einval(struct kvm_vm *vm, +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - test_nested_state_expect_errno(vm, state, EINVAL); + test_nested_state_expect_errno(vcpu, state, EINVAL); } -void test_nested_state_expect_efault(struct kvm_vm *vm, +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) { - test_nested_state_expect_errno(vm, state, EFAULT); + test_nested_state_expect_errno(vcpu, state, EFAULT); } void set_revision_id_for_vmcs12(struct kvm_nested_state *state, @@ -86,7 +85,7 @@ void set_default_vmx_state(struct kvm_nested_state *state, int size) set_revision_id_for_vmcs12(state, VMCS12_REVISION); } -void test_vmx_nested_state(struct kvm_vm *vm) +void test_vmx_nested_state(struct kvm_vcpu *vcpu) { /* Add a page for VMCS12. */ const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); @@ -96,14 +95,14 @@ void test_vmx_nested_state(struct kvm_vm *vm) /* The format must be set to 0. 0 for VMX, 1 for SVM. */ set_default_vmx_state(state, state_sz); state->format = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * We cannot virtualize anything if the guest does not have VMX * enabled. */ set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * We cannot virtualize anything if the guest does not have VMX @@ -112,55 +111,59 @@ void test_vmx_nested_state(struct kvm_vm *vm) */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); state->hdr.vmx.vmcs12_pa = -1ull; state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); state->flags = 0; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); /* * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without - * setting the nested state but flags other than eVMCS must be clear. - * The eVMCS flag can be set if the enlightened VMCS capability has - * been enabled. + * setting the nested state. When the eVMCS flag is not set, the + * expected return value is '0'. 
*/ set_default_vmx_state(state, state_sz); + state->flags = 0; state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state_expect_einval(vm, state); + test_nested_state(vcpu, state); - state->flags &= KVM_STATE_NESTED_EVMCS; + /* + * When eVMCS is supported, the eVMCS flag can only be set if the + * enlightened VMCS capability has been enabled. + */ if (have_evmcs) { - test_nested_state_expect_einval(vm, state); - vcpu_enable_evmcs(vm, VCPU_ID); + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu); + test_nested_state(vcpu, state); } - test_nested_state(vm, state); /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* Invalid flags are rejected. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = -1ull; state->flags = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* It is invalid to have vmxon_pa set to a non-page aligned address. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and @@ -170,7 +173,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->flags = KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING; state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * It is invalid to have any of the SMM flags set besides: @@ -180,13 +183,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) set_default_vmx_state(state, state_sz); state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* Outside SMM, SMM flags must be zero. */ set_default_vmx_state(state, state_sz); state->flags = 0; state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * Size must be large enough to fit kvm_nested_state and vmcs12 @@ -195,13 +198,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) set_default_vmx_state(state, state_sz); state->size = sizeof(*state); state->flags = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); set_default_vmx_state(state, state_sz); state->size = sizeof(*state); state->flags = 0; state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* * KVM_SET_NESTED_STATE succeeds with invalid VMCS @@ -209,7 +212,7 @@ void test_vmx_nested_state(struct kvm_vm *vm) */ set_default_vmx_state(state, state_sz); state->flags = 0; - test_nested_state(vm, state); + test_nested_state(vcpu, state); /* Invalid flags are rejected, even if no VMCS loaded. 
*/ set_default_vmx_state(state, state_sz); @@ -217,13 +220,13 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->flags = 0; state->hdr.vmx.vmcs12_pa = -1; state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* vmxon_pa cannot be the same address as vmcs_pa. */ set_default_vmx_state(state, state_sz); state->hdr.vmx.vmxon_pa = 0; state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vm, state); + test_nested_state_expect_einval(vcpu, state); /* * Test that if we leave nesting the state reflects that when we get @@ -233,8 +236,8 @@ void test_vmx_nested_state(struct kvm_vm *vm) state->hdr.vmx.vmxon_pa = -1ull; state->hdr.vmx.vmcs12_pa = -1ull; state->flags = 0; - test_nested_state(vm, state); - vcpu_nested_state_get(vm, VCPU_ID, state); + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); @@ -248,29 +251,32 @@ int main(int argc, char *argv[]) { struct kvm_vm *vm; struct kvm_nested_state state; + struct kvm_vcpu *vcpu; have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) { - print_skip("KVM_CAP_NESTED_STATE not available"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); /* * AMD currently does not implement set_nested_state, so for now we * just early out. */ - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); - vm = vm_create_default(VCPU_ID, 0, 0); + /* + * First run tests with VMX disabled to check error handling. + */ + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vm, NULL); + test_nested_state_expect_efault(vcpu, NULL); /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ set_default_state(&state); state.size = 0; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); /* * Setting the flags 0xf fails the flags check. The only flags that @@ -281,7 +287,7 @@ int main(int argc, char *argv[]) */ set_default_state(&state); state.flags = 0xf; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); /* * If KVM_STATE_NESTED_RUN_PENDING is set then @@ -289,9 +295,9 @@ int main(int argc, char *argv[]) */ set_default_state(&state); state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vm, &state); + test_nested_state_expect_einval(vcpu, &state); - test_vmx_nested_state(vm); + test_vmx_nested_state(vcpu); kvm_vm_free(vm); return 0; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index fbe8417cbc2c..2ceb5c78c442 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -32,9 +32,6 @@ #define MSR_IA32_TSC_ADJUST 0x3b #endif -#define PAGE_SIZE 4096 -#define VCPU_ID 5 - #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) @@ -52,11 +49,6 @@ enum { NUM_VMX_PAGES, }; -struct kvm_single_msr { - struct kvm_msrs header; - struct kvm_msr_entry entry; -} __attribute__((packed)); - /* The virtual machine object. 
*/ static struct kvm_vm *vm; @@ -128,29 +120,25 @@ static void report(int64_t val) int main(int argc, char *argv[]) { vm_vaddr_t vmx_pages_gva; + struct kvm_vcpu *vcpu; - nested_vmx_check_supported(); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); - vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); /* Allocate VMX pages and shared descriptors (vmx_pages). */ vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); for (;;) { - volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; - vcpu_run(vm, VCPU_ID); - TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, - "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", - run->exit_reason, - exit_reason_str(run->exit_reason)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); - switch (get_ucall(vm, VCPU_ID, &uc)) { + switch (get_ucall(vcpu, &uc)) { case UCALL_ABORT: - TEST_FAIL("%s", (const char *)uc.args[0]); + REPORT_GUEST_ASSERT(uc); /* NOT REACHED */ case UCALL_SYNC: report(uc.args[1]); @@ -162,7 +150,7 @@ int main(int argc, char *argv[]) } } - kvm_vm_free(vm); done: + kvm_vm_free(vm); return 0; } diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c new file mode 100644 index 000000000000..a76078a08ff8 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * xapic_ipi_test + * + * Copyright (C) 2020, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake + * another vCPU that is halted when KVM's backing page for the APIC access + * address has been moved by mm. + * + * The test starts two vCPUs: one that sends IPIs and one that continually + * executes HLT. The sender checks that the halter has woken from the HLT and + * has reentered HLT before sending the next IPI. While the vCPUs are running, + * the host continually calls migrate_pages to move all of the process' pages + * amongst the available numa nodes on the machine. + * + * Migration is a command line option. When used on non-numa machines will + * exit with error. Test is still usefull on non-numa for testing IPIs. + */ +#include <getopt.h> +#include <pthread.h> +#include <inttypes.h> +#include <string.h> +#include <time.h> + +#include "kvm_util.h" +#include "numaif.h" +#include "processor.h" +#include "test_util.h" +#include "vmx.h" + +/* Default running time for the test */ +#define DEFAULT_RUN_SECS 3 + +/* Default delay between migrate_pages calls (microseconds) */ +#define DEFAULT_DELAY_USECS 500000 + +/* + * Vector for IPI from sender vCPU to halting vCPU. + * Value is arbitrary and was chosen for the alternating bit pattern. Any + * value should work. + */ +#define IPI_VECTOR 0xa5 + +/* + * Incremented in the IPI handler. 
Provides evidence to the sender that the IPI + * arrived at the destination + */ +static volatile uint64_t ipis_rcvd; + +/* Data struct shared between host main thread and vCPUs */ +struct test_data_page { + uint32_t halter_apic_id; + volatile uint64_t hlt_count; + volatile uint64_t wake_count; + uint64_t ipis_sent; + uint64_t migrations_attempted; + uint64_t migrations_completed; + uint32_t icr; + uint32_t icr2; + uint32_t halter_tpr; + uint32_t halter_ppr; + + /* + * Record local version register as a cross-check that APIC access + * worked. Value should match what KVM reports (APIC_VERSION in + * arch/x86/kvm/lapic.c). If test is failing, check that values match + * to determine whether APIC access exits are working. + */ + uint32_t halter_lvr; +}; + +struct thread_params { + struct test_data_page *data; + struct kvm_vcpu *vcpu; + uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */ +}; + +void verify_apic_base_addr(void) +{ + uint64_t msr = rdmsr(MSR_IA32_APICBASE); + uint64_t base = GET_APIC_BASE(msr); + + GUEST_ASSERT(base == APIC_DEFAULT_GPA); +} + +static void halter_guest_code(struct test_data_page *data) +{ + verify_apic_base_addr(); + xapic_enable(); + + data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)); + data->halter_lvr = xapic_read_reg(APIC_LVR); + + /* + * Loop forever HLTing and recording halts & wakes. Disable interrupts + * each time around to minimize window between signaling the pending + * halt to the sender vCPU and executing the halt. No need to disable on + * first run as this vCPU executes first and the host waits for it to + * signal going into first halt before starting the sender vCPU. Record + * TPR and PPR for diagnostic purposes in case the test fails. + */ + for (;;) { + data->halter_tpr = xapic_read_reg(APIC_TASKPRI); + data->halter_ppr = xapic_read_reg(APIC_PROCPRI); + data->hlt_count++; + asm volatile("sti; hlt; cli"); + data->wake_count++; + } +} + +/* + * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to + * enable diagnosing errant writes to the APIC access address backing page in + * case of test failure. + */ +static void guest_ipi_handler(struct ex_regs *regs) +{ + ipis_rcvd++; + xapic_write_reg(APIC_EOI, 77); +} + +static void sender_guest_code(struct test_data_page *data) +{ + uint64_t last_wake_count; + uint64_t last_hlt_count; + uint64_t last_ipis_rcvd_count; + uint32_t icr_val; + uint32_t icr2_val; + uint64_t tsc_start; + + verify_apic_base_addr(); + xapic_enable(); + + /* + * Init interrupt command register for sending IPIs + * + * Delivery mode=fixed, per SDM: + * "Delivers the interrupt specified in the vector field to the target + * processor." + * + * Destination mode=physical i.e. specify target by its local APIC + * ID. This vCPU assumes that the halter vCPU has already started and + * set data->halter_apic_id. + */ + icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR); + icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id); + data->icr = icr_val; + data->icr2 = icr2_val; + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + for (;;) { + /* + * Send IPI to halter vCPU. + * First IPI can be sent unconditionally because halter vCPU + * starts earlier. + */ + xapic_write_reg(APIC_ICR2, icr2_val); + xapic_write_reg(APIC_ICR, icr_val); + data->ipis_sent++; + + /* + * Wait up to ~1 sec for halter to indicate that it has: + * 1. Received the IPI + * 2. Woken up from the halt + * 3. 
Gone back into halt + * Current CPUs typically run at 2.x Ghz which is ~2 + * billion ticks per second. + */ + tsc_start = rdtsc(); + while (rdtsc() - tsc_start < 2000000000) { + if ((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)) + break; + } + + GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) && + (data->wake_count != last_wake_count) && + (data->hlt_count != last_hlt_count)); + + last_wake_count = data->wake_count; + last_hlt_count = data->hlt_count; + last_ipis_rcvd_count = ipis_rcvd; + } +} + +static void *vcpu_thread(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + struct kvm_vcpu *vcpu = params->vcpu; + struct ucall uc; + int old; + int r; + + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); + TEST_ASSERT(r == 0, + "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", + vcpu->id, r); + + fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + if (get_ucall(vcpu, &uc) == UCALL_ABORT) { + TEST_ASSERT(false, + "vCPU %u exited with error: %s.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu", + vcpu->id, (const char *)uc.args[0], + params->data->ipis_sent, params->data->hlt_count, + params->data->wake_count, + *params->pipis_rcvd, params->data->halter_tpr, + params->data->halter_ppr, params->data->halter_lvr, + params->data->migrations_attempted, + params->data->migrations_completed); + } + + return NULL; +} + +static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu) +{ + void *retval; + int r; + + r = pthread_cancel(thread); + TEST_ASSERT(r == 0, + "pthread_cancel on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + + r = pthread_join(thread, &retval); + TEST_ASSERT(r == 0, + "pthread_join on vcpu_id=%d failed with errno=%d", + vcpu->id, r); + TEST_ASSERT(retval == PTHREAD_CANCELED, + "expected retval=%p, got %p", PTHREAD_CANCELED, + retval); +} + +void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, + uint64_t *pipis_rcvd) +{ + long pages_not_moved; + unsigned long nodemask = 0; + unsigned long nodemasks[sizeof(nodemask) * 8]; + int nodes = 0; + time_t start_time, last_update, now; + time_t interval_secs = 1; + int i, r; + int from, to; + unsigned long bit; + uint64_t hlt_count; + uint64_t wake_count; + uint64_t ipis_sent; + + fprintf(stderr, "Calling migrate_pages every %d microseconds\n", + delay_usecs); + + /* Get set of first 64 numa nodes available */ + r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + 0, MPOL_F_MEMS_ALLOWED); + TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); + + fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " + "(each 1-bit indicates node is present): %#lx\n", + sizeof(nodemask) * 8, nodemask); + + /* Init array of masks containing a single-bit in each, one for each + * available node. migrate_pages called below requires specifying nodes + * as bit masks. + */ + for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) { + if (nodemask & bit) { + nodemasks[nodes] = nodemask & bit; + nodes++; + } + } + + TEST_ASSERT(nodes > 1, + "Did not find at least 2 numa nodes. 
Can't do migration"); + + fprintf(stderr, "Migrating amongst %d nodes found\n", nodes); + + from = 0; + to = 1; + start_time = time(NULL); + last_update = start_time; + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + + while ((int)(time(NULL) - start_time) < run_secs) { + data->migrations_attempted++; + + /* + * migrate_pages with PID=0 will migrate all pages of this + * process between the nodes specified as bitmasks. The page + * backing the APIC access address belongs to this process + * because it is allocated by KVM in the context of the + * KVM_CREATE_VCPU ioctl. If that assumption ever changes this + * test may break or give a false positive signal. + */ + pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]), + &nodemasks[from], + &nodemasks[to]); + if (pages_not_moved < 0) + fprintf(stderr, + "migrate_pages failed, errno=%d\n", errno); + else if (pages_not_moved > 0) + fprintf(stderr, + "migrate_pages could not move %ld pages\n", + pages_not_moved); + else + data->migrations_completed++; + + from = to; + to++; + if (to == nodes) + to = 0; + + now = time(NULL); + if (((now - start_time) % interval_secs == 0) && + (now != last_update)) { + last_update = now; + fprintf(stderr, + "%lu seconds: Migrations attempted=%lu completed=%lu, " + "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n", + now - start_time, data->migrations_attempted, + data->migrations_completed, + data->ipis_sent, *pipis_rcvd, + data->hlt_count, data->wake_count); + + TEST_ASSERT(ipis_sent != data->ipis_sent && + hlt_count != data->hlt_count && + wake_count != data->wake_count, + "IPI, HLT and wake count have not increased " + "in the last %lu seconds. " + "HLTer is likely hung.", interval_secs); + + ipis_sent = data->ipis_sent; + hlt_count = data->hlt_count; + wake_count = data->wake_count; + } + usleep(delay_usecs); + } +} + +void get_cmdline_args(int argc, char *argv[], int *run_secs, + bool *migrate, int *delay_usecs) +{ + for (;;) { + int opt = getopt(argc, argv, "s:d:m"); + + if (opt == -1) + break; + switch (opt) { + case 's': + *run_secs = parse_size(optarg); + break; + case 'm': + *migrate = true; + break; + case 'd': + *delay_usecs = parse_size(optarg); + break; + default: + TEST_ASSERT(false, + "Usage: -s <runtime seconds>. Default is %d seconds.\n" + "-m adds calls to migrate_pages while vCPUs are running." + " Default is no migrations.\n" + "-d <delay microseconds> - delay between migrate_pages() calls." 
+ " Default is %d microseconds.", + DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS); + } + } +} + +int main(int argc, char *argv[]) +{ + int r; + int wait_secs; + const int max_halter_wait = 10; + int run_secs = 0; + int delay_usecs = 0; + struct test_data_page *data; + vm_vaddr_t test_data_page_vaddr; + bool migrate = false; + pthread_t threads[2]; + struct thread_params params[2]; + struct kvm_vm *vm; + uint64_t *pipis_rcvd; + + get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs); + if (run_secs <= 0) + run_secs = DEFAULT_RUN_SECS; + if (delay_usecs <= 0) + delay_usecs = DEFAULT_DELAY_USECS; + + vm = vm_create_with_one_vcpu(¶ms[0].vcpu, halter_guest_code); + + vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + + params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code); + + test_data_page_vaddr = vm_vaddr_alloc_page(vm); + data = addr_gva2hva(vm, test_data_page_vaddr); + memset(data, 0, sizeof(*data)); + params[0].data = data; + params[1].data = data; + + vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr); + vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr); + + pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd); + params[0].pipis_rcvd = pipis_rcvd; + params[1].pipis_rcvd = pipis_rcvd; + + /* Start halter vCPU thread and wait for it to execute first HLT. */ + r = pthread_create(&threads[0], NULL, vcpu_thread, ¶ms[0]); + TEST_ASSERT(r == 0, + "pthread_create halter failed errno=%d", errno); + fprintf(stderr, "Halter vCPU thread started\n"); + + wait_secs = 0; + while ((wait_secs < max_halter_wait) && !data->hlt_count) { + sleep(1); + wait_secs++; + } + + TEST_ASSERT(data->hlt_count, + "Halter vCPU did not execute first HLT within %d seconds", + max_halter_wait); + + fprintf(stderr, + "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n", + data->halter_apic_id, wait_secs); + + r = pthread_create(&threads[1], NULL, vcpu_thread, ¶ms[1]); + TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno); + + fprintf(stderr, + "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n", + run_secs); + + if (!migrate) + sleep(run_secs); + else + do_migrations(data, run_secs, delay_usecs, pipis_rcvd); + + /* + * Cancel threads and wait for them to stop. 
+ */ + cancel_join_vcpu_thread(threads[0], params[0].vcpu); + cancel_join_vcpu_thread(threads[1], params[1].vcpu); + + fprintf(stderr, + "Test successful after running for %d seconds.\n" + "Sending vCPU sent %lu IPIs to halting vCPU\n" + "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n" + "Halter APIC ID=%#x\n" + "Sender ICR value=%#x ICR2 value=%#x\n" + "Halter TPR=%#x PPR=%#x LVR=%#x\n" + "Migrations attempted: %lu\n" + "Migrations completed: %lu\n", + run_secs, data->ipis_sent, + data->hlt_count, data->wake_count, *pipis_rcvd, + data->halter_apic_id, + data->icr, data->icr2, + data->halter_tpr, data->halter_ppr, data->halter_lvr, + data->migrations_attempted, data->migrations_completed); + + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c new file mode 100644 index 000000000000..69849acd95b0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "apic.h" +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +struct xapic_vcpu { + struct kvm_vcpu *vcpu; + bool is_x2apic; +}; + +static void xapic_guest_code(void) +{ + asm volatile("cli"); + + xapic_enable(); + + while (1) { + uint64_t val = (u64)xapic_read_reg(APIC_IRR) | + (u64)xapic_read_reg(APIC_IRR + 0x10) << 32; + + xapic_write_reg(APIC_ICR2, val >> 32); + xapic_write_reg(APIC_ICR, val); + GUEST_SYNC(val); + } +} + +static void x2apic_guest_code(void) +{ + asm volatile("cli"); + + x2apic_enable(); + + do { + uint64_t val = x2apic_read_reg(APIC_IRR) | + x2apic_read_reg(APIC_IRR + 0x10) << 32; + + x2apic_write_reg(APIC_ICR, val); + GUEST_SYNC(val); + } while (1); +} + +static void ____test_icr(struct xapic_vcpu *x, uint64_t val) +{ + struct kvm_vcpu *vcpu = x->vcpu; + struct kvm_lapic_state xapic; + struct ucall uc; + uint64_t icr; + + /* + * Tell the guest what ICR value to write. Use the IRR to pass info, + * all bits are valid and should not be modified by KVM (ignoring the + * fact that vectors 0-15 are technically illegal). + */ + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + *((u32 *)&xapic.regs[APIC_IRR]) = val; + *((u32 *)&xapic.regs[APIC_IRR + 0x10]) = val >> 32; + vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic); + + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); + TEST_ASSERT_EQ(uc.args[1], val); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) | + (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32; + if (!x->is_x2apic) { + val &= (-1u | (0xffull << (32 + 24))); + TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY); + } else { + TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY); + } +} + +#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \ + GENMASK_ULL(17,16) | \ + GENMASK_ULL(13,13)) + +static void __test_icr(struct xapic_vcpu *x, uint64_t val) +{ + if (x->is_x2apic) { + /* Hardware writing vICR register requires reserved bits 31:20, + * 17:16 and 13 kept as zero to avoid #GP exception. Data value + * written to vICR should mask out those bits above. 
+ */ + val &= ~X2APIC_RSVED_BITS_MASK; + } + ____test_icr(x, val | APIC_ICR_BUSY); + ____test_icr(x, val & ~(u64)APIC_ICR_BUSY); +} + +static void test_icr(struct xapic_vcpu *x) +{ + struct kvm_vcpu *vcpu = x->vcpu; + uint64_t icr, i, j; + + icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED; + for (i = 0; i <= 0xff; i++) + __test_icr(x, icr | i); + + icr = APIC_INT_ASSERT | APIC_DM_FIXED; + for (i = 0; i <= 0xff; i++) + __test_icr(x, icr | i); + + /* + * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of + * vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff. + */ + icr = APIC_INT_ASSERT | 0xff; + for (i = 0; i < 0xff; i++) { + if (i == vcpu->id) + continue; + for (j = 0; j < 8; j++) + __test_icr(x, i << (32 + 24) | icr | (j << 8)); + } + + /* And again with a shorthand destination for all types of IPIs. */ + icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT; + for (i = 0; i < 8; i++) + __test_icr(x, icr | (i << 8)); + + /* And a few garbage value, just make sure it's an IRQ (blocked). */ + __test_icr(x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK); + __test_icr(x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK); + __test_icr(x, -1ull & ~APIC_DM_FIXED_MASK); +} + +static void __test_apic_id(struct kvm_vcpu *vcpu, uint64_t apic_base) +{ + uint32_t apic_id, expected; + struct kvm_lapic_state xapic; + + vcpu_set_msr(vcpu, MSR_IA32_APICBASE, apic_base); + + vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic); + + expected = apic_base & X2APIC_ENABLE ? vcpu->id : vcpu->id << 24; + apic_id = *((u32 *)&xapic.regs[APIC_ID]); + + TEST_ASSERT(apic_id == expected, + "APIC_ID not set back to %s format; wanted = %x, got = %x", + (apic_base & X2APIC_ENABLE) ? "x2APIC" : "xAPIC", + expected, apic_id); +} + +/* + * Verify that KVM switches the APIC_ID between xAPIC and x2APIC when userspace + * stuffs MSR_IA32_APICBASE. Setting the APIC_ID when x2APIC is enabled and + * when the APIC transitions for DISABLED to ENABLED is architectural behavior + * (on Intel), whereas the x2APIC => xAPIC transition behavior is KVM ABI since + * attempted to transition from x2APIC to xAPIC without disabling the APIC is + * architecturally disallowed. + */ +static void test_apic_id(void) +{ + const uint32_t NR_VCPUS = 3; + struct kvm_vcpu *vcpus[NR_VCPUS]; + uint64_t apic_base; + struct kvm_vm *vm; + int i; + + vm = vm_create_with_vcpus(NR_VCPUS, NULL, vcpus); + vm_enable_cap(vm, KVM_CAP_X2APIC_API, KVM_X2APIC_API_USE_32BIT_IDS); + + for (i = 0; i < NR_VCPUS; i++) { + apic_base = vcpu_get_msr(vcpus[i], MSR_IA32_APICBASE); + + TEST_ASSERT(apic_base & MSR_IA32_APICBASE_ENABLE, + "APIC not in ENABLED state at vCPU RESET"); + TEST_ASSERT(!(apic_base & X2APIC_ENABLE), + "APIC not in xAPIC mode at vCPU RESET"); + + __test_apic_id(vcpus[i], apic_base); + __test_apic_id(vcpus[i], apic_base | X2APIC_ENABLE); + __test_apic_id(vcpus[i], apic_base); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + struct xapic_vcpu x = { + .vcpu = NULL, + .is_x2apic = true, + }; + struct kvm_vm *vm; + + vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code); + test_icr(&x); + kvm_vm_free(vm); + + /* + * Use a second VM for the xAPIC test so that x2APIC can be hidden from + * the guest in order to test AVIC. KVM disallows changing CPUID after + * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC. 
+ */ + vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code); + x.is_x2apic = false; + + vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC); + + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); + test_icr(&x); + kvm_vm_free(vm); + + test_apic_id(); +} diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c new file mode 100644 index 000000000000..95ce192d0753 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XCR0 cpuid test + * + * Copyright (C) 2022, Google LLC. + */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +/* + * Assert that architectural dependency rules are satisfied, e.g. that AVX is + * supported if and only if SSE is supported. + */ +#define ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, xfeatures, dependencies) \ +do { \ + uint64_t __supported = (supported_xcr0) & ((xfeatures) | (dependencies)); \ + \ + __GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \ + __supported == ((xfeatures) | (dependencies)), \ + "supported = 0x%lx, xfeatures = 0x%llx, dependencies = 0x%llx", \ + __supported, (xfeatures), (dependencies)); \ +} while (0) + +/* + * Assert that KVM reports a sane, usable as-is XCR0. Architecturally, a CPU + * isn't strictly required to _support_ all XFeatures related to a feature, but + * at the same time XSETBV will #GP if bundled XFeatures aren't enabled and + * disabled coherently. E.g. a CPU can technically enumerate supported for + * XTILE_CFG but not XTILE_DATA, but attempting to enable XTILE_CFG without + * XTILE_DATA will #GP. 
+ */ +#define ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, xfeatures) \ +do { \ + uint64_t __supported = (supported_xcr0) & (xfeatures); \ + \ + __GUEST_ASSERT(!__supported || __supported == (xfeatures), \ + "supported = 0x%lx, xfeatures = 0x%llx", \ + __supported, (xfeatures)); \ +} while (0) + +static void guest_code(void) +{ + uint64_t xcr0_reset; + uint64_t supported_xcr0; + int i, vector; + + set_cr4(get_cr4() | X86_CR4_OSXSAVE); + + xcr0_reset = xgetbv(0); + supported_xcr0 = this_cpu_supported_xcr0(); + + GUEST_ASSERT(xcr0_reset == XFEATURE_MASK_FP); + + /* Check AVX */ + ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, + XFEATURE_MASK_YMM, + XFEATURE_MASK_SSE); + + /* Check MPX */ + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + /* Check AVX-512 */ + ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, + XFEATURE_MASK_AVX512, + XFEATURE_MASK_SSE | XFEATURE_MASK_YMM); + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_AVX512); + + /* Check AMX */ + ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, + XFEATURE_MASK_XTILE); + + vector = xsetbv_safe(0, supported_xcr0); + __GUEST_ASSERT(!vector, + "Expected success on XSETBV(0x%lx), got vector '0x%x'", + supported_xcr0, vector); + + for (i = 0; i < 64; i++) { + if (supported_xcr0 & BIT_ULL(i)) + continue; + + vector = xsetbv_safe(0, supported_xcr0 | BIT_ULL(i)); + __GUEST_ASSERT(vector == GP_VECTOR, + "Expected #GP on XSETBV(0x%llx), supported XCR0 = %lx, got vector '0x%x'", + BIT_ULL(i), supported_xcr0, vector); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + struct ucall uc; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + run = vcpu->run; + + while (1) { + vcpu_run(vcpu); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Unexpected exit reason: %u (%s),", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c new file mode 100644 index 000000000000..a59b3c799bb2 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2021 Amazon.com, Inc. or its affiliates. 
+ */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include <stdint.h> +#include <time.h> +#include <sched.h> +#include <signal.h> +#include <pthread.h> + +#include <sys/eventfd.h> + +#define SHINFO_REGION_GVA 0xc0000000ULL +#define SHINFO_REGION_GPA 0xc0000000ULL +#define SHINFO_REGION_SLOT 10 + +#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (3 * PAGE_SIZE)) +#define DUMMY_REGION_SLOT 11 + +#define DUMMY_REGION_GPA_2 (SHINFO_REGION_GPA + (4 * PAGE_SIZE)) +#define DUMMY_REGION_SLOT_2 12 + +#define SHINFO_ADDR (SHINFO_REGION_GPA) +#define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) +#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) +#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15) + +#define SHINFO_VADDR (SHINFO_REGION_GVA) +#define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) +#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15) + +#define EVTCHN_VECTOR 0x10 + +#define EVTCHN_TEST1 15 +#define EVTCHN_TEST2 66 +#define EVTCHN_TIMER 13 + +enum { + TEST_INJECT_VECTOR = 0, + TEST_RUNSTATE_runnable, + TEST_RUNSTATE_blocked, + TEST_RUNSTATE_offline, + TEST_RUNSTATE_ADJUST, + TEST_RUNSTATE_DATA, + TEST_STEAL_TIME, + TEST_EVTCHN_MASKED, + TEST_EVTCHN_UNMASKED, + TEST_EVTCHN_SLOWPATH, + TEST_EVTCHN_SEND_IOCTL, + TEST_EVTCHN_HCALL, + TEST_EVTCHN_HCALL_SLOWPATH, + TEST_EVTCHN_HCALL_EVENTFD, + TEST_TIMER_SETUP, + TEST_TIMER_WAIT, + TEST_TIMER_RESTORE, + TEST_POLL_READY, + TEST_POLL_TIMEOUT, + TEST_POLL_MASKED, + TEST_POLL_WAKE, + SET_VCPU_INFO, + TEST_TIMER_PAST, + TEST_LOCKING_SEND_RACE, + TEST_LOCKING_POLL_RACE, + TEST_LOCKING_POLL_TIMEOUT, + TEST_DONE, + + TEST_GUEST_SAW_IRQ, +}; + +#define XEN_HYPERCALL_MSR 0x40000000 + +#define MIN_STEAL_TIME 50000 + +#define SHINFO_RACE_TIMEOUT 2 /* seconds */ + +#define __HYPERVISOR_set_timer_op 15 +#define __HYPERVISOR_sched_op 29 +#define __HYPERVISOR_event_channel_op 32 + +#define SCHEDOP_poll 3 + +#define EVTCHNOP_send 4 + +#define EVTCHNSTAT_interdomain 2 + +struct evtchn_send { + u32 port; +}; + +struct sched_poll { + u32 *ports; + unsigned int nr_ports; + u64 timeout; +}; + +struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; +} __attribute__((__packed__)); /* 32 bytes */ + +struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; +} __attribute__((__packed__)); + +struct vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; /* Extra field for overrun check */ +}; + +struct compat_vcpu_runstate_info { + uint32_t state; + uint64_t state_entry_time; + uint64_t time[5]; +} __attribute__((__packed__)); + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; + +struct vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; +}; /* 64 bytes (x86) */ + +struct shared_info { + struct vcpu_info vcpu_info[32]; + unsigned long evtchn_pending[64]; + unsigned long evtchn_mask[64]; + struct pvclock_wall_clock wc; + uint32_t wc_sec_hi; + /* arch_shared_info here */ +}; + +#define RUNSTATE_running 0 +#define RUNSTATE_runnable 1 +#define RUNSTATE_blocked 2 +#define RUNSTATE_offline 3 + +static const char *runstate_names[] = { + "running", + "runnable", + "blocked", + "offline" +}; + +struct { + struct kvm_irq_routing info; + struct kvm_irq_routing_entry entries[2]; +} irq_routes; + 
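The two runstate layouts above differ only in packing: the 32-bit Xen compat ABI places state_entry_time immediately after the 32-bit state field, while the natural 64-bit layout pads state out to an 8-byte boundary. The host-side page-boundary checks later in this file rely on exactly that 4-byte shift. A minimal standalone sketch of the offsets, illustrative only (x86_64 host assumed; these struct names are not from the patch):

	#include <stddef.h>
	#include <stdint.h>

	struct native_runstate {		/* mirrors vcpu_runstate_info */
		uint32_t state;
		uint64_t state_entry_time;
		uint64_t time[5];		/* extra slot for the overrun check, as above */
	};

	struct compat_runstate {		/* mirrors compat_vcpu_runstate_info */
		uint32_t state;
		uint64_t state_entry_time;
		uint64_t time[5];
	} __attribute__((__packed__));

	/*
	 * Without packing, 4 bytes of padding follow 'state', so
	 * state_entry_time lands at offset 8; the packed compat ABI
	 * keeps it at offset 4.
	 */
	_Static_assert(offsetof(struct native_runstate, state_entry_time) == 8, "64-bit layout");
	_Static_assert(offsetof(struct compat_runstate, state_entry_time) == 4, "compat layout");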
+static volatile bool guest_saw_irq; + +static void evtchn_handler(struct ex_regs *regs) +{ + struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; + + vcpu_arch_put_guest(vi->evtchn_upcall_pending, 0); + vcpu_arch_put_guest(vi->evtchn_pending_sel, 0); + guest_saw_irq = true; + + GUEST_SYNC(TEST_GUEST_SAW_IRQ); +} + +static void guest_wait_for_irq(void) +{ + while (!guest_saw_irq) + __asm__ __volatile__ ("rep nop" : : : "memory"); + guest_saw_irq = false; +} + +static void guest_code(void) +{ + struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; + int i; + + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + /* Trigger an interrupt injection */ + GUEST_SYNC(TEST_INJECT_VECTOR); + + guest_wait_for_irq(); + + /* Test having the host set runstates manually */ + GUEST_SYNC(TEST_RUNSTATE_runnable); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(TEST_RUNSTATE_blocked); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0); + GUEST_ASSERT(rs->state == 0); + + GUEST_SYNC(TEST_RUNSTATE_offline); + GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0); + GUEST_ASSERT(rs->state == 0); + + /* Test runstate time adjust */ + GUEST_SYNC(TEST_RUNSTATE_ADJUST); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b); + + /* Test runstate time set */ + GUEST_SYNC(TEST_RUNSTATE_DATA); + GUEST_ASSERT(rs->state_entry_time >= 0x8000); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0); + GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b); + GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a); + + /* sched_yield() should result in some 'runnable' time */ + GUEST_SYNC(TEST_STEAL_TIME); + GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); + + /* Attempt to deliver a *masked* interrupt */ + GUEST_SYNC(TEST_EVTCHN_MASKED); + + /* Wait until we see the bit set */ + struct shared_info *si = (void *)SHINFO_VADDR; + while (!si->evtchn_pending[0]) + __asm__ __volatile__ ("rep nop" : : : "memory"); + + /* Now deliver an *unmasked* interrupt */ + GUEST_SYNC(TEST_EVTCHN_UNMASKED); + + guest_wait_for_irq(); + + /* Change memslots and deliver an interrupt */ + GUEST_SYNC(TEST_EVTCHN_SLOWPATH); + + guest_wait_for_irq(); + + /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */ + GUEST_SYNC(TEST_EVTCHN_SEND_IOCTL); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL); + + /* Our turn. Deliver event channel (to ourselves) with + * EVTCHNOP_send hypercall. */ + struct evtchn_send s = { .port = 127 }; + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL_SLOWPATH); + + /* + * Same again, but this time the host has messed with memslots so it + * should take the slow path in kvm_xen_set_evtchn(). + */ + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_EVTCHN_HCALL_EVENTFD); + + /* Deliver "outbound" event channel to an eventfd which + * happens to be one of our own irqfds. */ + s.port = 197; + xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_SETUP); + + /* Set a timer 100ms in the future. */ + xen_hypercall(__HYPERVISOR_set_timer_op, + rs->state_entry_time + 100000000, NULL); + + GUEST_SYNC(TEST_TIMER_WAIT); + + /* Now wait for the timer */ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_RESTORE); + + /* The host has 'restored' the timer. Just wait for it. 
*/ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_POLL_READY); + + /* Poll for an event channel port which is already set */ + u32 ports[1] = { EVTCHN_TIMER }; + struct sched_poll p = { + .ports = ports, + .nr_ports = 1, + .timeout = 0, + }; + + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_TIMEOUT); + + /* Poll for an unset port and wait for the timeout. */ + p.timeout = 100000000; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_MASKED); + + /* A timer will wake the masked port we're waiting on, while we poll */ + p.timeout = 0; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + GUEST_SYNC(TEST_POLL_WAKE); + + /* Set the vcpu_info to point at exactly the place it already is to + * make sure the attribute is functional. */ + GUEST_SYNC(SET_VCPU_INFO); + + /* A timer wake an *unmasked* port which should wake us with an + * actual interrupt, while we're polling on a different port. */ + ports[0]++; + p.timeout = 0; + xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_TIMER_PAST); + + /* Timer should have fired already */ + guest_wait_for_irq(); + + GUEST_SYNC(TEST_LOCKING_SEND_RACE); + /* Racing host ioctls */ + + guest_wait_for_irq(); + + GUEST_SYNC(TEST_LOCKING_POLL_RACE); + /* Racing vmcall against host ioctl */ + + ports[0] = 0; + + p = (struct sched_poll) { + .ports = ports, + .nr_ports = 1, + .timeout = 0 + }; + +wait_for_timer: + /* + * Poll for a timer wake event while the worker thread is mucking with + * the shared info. KVM XEN drops timer IRQs if the shared info is + * invalid when the timer expires. Arbitrarily poll 100 times before + * giving up and asking the VMM to re-arm the timer. 100 polls should + * consume enough time to beat on KVM without taking too long if the + * timer IRQ is dropped due to an invalid event channel. + */ + for (i = 0; i < 100 && !guest_saw_irq; i++) + __xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p); + + /* + * Re-send the timer IRQ if it was (likely) dropped due to the timer + * expiring while the event channel was invalid. 
+ */ + if (!guest_saw_irq) { + GUEST_SYNC(TEST_LOCKING_POLL_TIMEOUT); + goto wait_for_timer; + } + guest_saw_irq = false; + + GUEST_SYNC(TEST_DONE); +} + +static struct shared_info *shinfo; +static struct vcpu_info *vinfo; +static struct kvm_vcpu *vcpu; + +static void handle_alrm(int sig) +{ + if (vinfo) + printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending); + vcpu_dump(stdout, vcpu, 0); + TEST_FAIL("IRQ delivery timed out"); +} + +static void *juggle_shinfo_state(void *arg) +{ + struct kvm_vm *vm = (struct kvm_vm *)arg; + + struct kvm_xen_hvm_attr cache_activate_gfn = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE + }; + + struct kvm_xen_hvm_attr cache_deactivate_gfn = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.gfn = KVM_XEN_INVALID_GFN + }; + + struct kvm_xen_hvm_attr cache_activate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA, + .u.shared_info.hva = (unsigned long)shinfo + }; + + struct kvm_xen_hvm_attr cache_deactivate_hva = { + .type = KVM_XEN_ATTR_TYPE_SHARED_INFO, + .u.shared_info.hva = 0 + }; + + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + + for (;;) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn); + pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn); + + if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) { + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva); + pthread_testcancel(); + __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva); + } + } + + return NULL; +} + +int main(int argc, char *argv[]) +{ + struct kvm_xen_hvm_attr evt_reset; + struct kvm_vm *vm; + pthread_t thread; + bool verbose; + int ret; + + verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) || + !strncmp(argv[1], "--verbose", 10)); + + int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO); + + bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG); + bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); + bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND); + bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* Map a region for the shared_info page */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0); + virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3); + + shinfo = addr_gpa2hva(vm, SHINFO_VADDR); + + int zero_fd = open("/dev/zero", O_RDONLY); + TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero"); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + + /* Let the kernel know that we *will* use it for sending all + * event channels, which lets it intercept SCHEDOP_poll */ + if (do_evtchn_tests) + hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + struct kvm_xen_hvm_attr lm = { + .type = KVM_XEN_ATTR_TYPE_LONG_MODE, + .u.long_mode = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + if (do_runstate_flag) { + struct kvm_xen_hvm_attr ruf = { + .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG, + .u.runstate_update_flag = 1, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf); + + ruf.u.runstate_update_flag = 0; + vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf); + TEST_ASSERT(ruf.u.runstate_update_flag == 1, + "Failed to read 
back RUNSTATE_UPDATE_FLAG attr"); + } + + struct kvm_xen_hvm_attr ha = {}; + + if (has_shinfo_hva) { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA; + ha.u.shared_info.hva = (unsigned long)shinfo; + } else { + ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO; + ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE; + } + + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha); + + /* + * Test what happens when the HVA of the shinfo page is remapped after + * the kernel has a reference to it. But make sure we copy the clock + * info over since that's only set at setup time, and we test it later. + */ + struct pvclock_wall_clock wc_copy = shinfo->wc; + void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0); + TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info"); + shinfo->wc = wc_copy; + + struct kvm_xen_vcpu_attr vi = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, + .u.gpa = VCPU_INFO_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi); + + struct kvm_xen_vcpu_attr pvclock = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, + .u.gpa = PVTIME_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock); + + struct kvm_xen_hvm_attr vec = { + .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, + .u.vector = EVTCHN_VECTOR, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); + + vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); + + if (do_runstate_tests) { + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = RUNSTATE_ADDR, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); + } + + int irq_fd[2] = { -1, -1 }; + + if (do_eventfd_tests) { + irq_fd[0] = eventfd(0, 0); + irq_fd[1] = eventfd(0, 0); + + /* Unexpected, but not a KVM failure */ + if (irq_fd[0] == -1 || irq_fd[1] == -1) + do_evtchn_tests = do_eventfd_tests = false; + } + + if (do_eventfd_tests) { + irq_routes.info.nr = 2; + + irq_routes.entries[0].gsi = 32; + irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1; + irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id; + irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + irq_routes.entries[1].gsi = 33; + irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2; + irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id; + irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info); + + struct kvm_irqfd ifd = { }; + + ifd.fd = irq_fd[0]; + ifd.gsi = 32; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + ifd.fd = irq_fd[1]; + ifd.gsi = 33; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + struct sigaction sa = { }; + sa.sa_handler = handle_alrm; + sigaction(SIGALRM, &sa, NULL); + } + + struct kvm_xen_vcpu_attr tmr = { + .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, + .u.timer.port = EVTCHN_TIMER, + .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + .u.timer.expires_ns = 0 + }; + + if (do_evtchn_tests) { + struct kvm_xen_hvm_attr inj = { + .type = KVM_XEN_ATTR_TYPE_EVTCHN, + .u.evtchn.send_port = 127, + .u.evtchn.type = EVTCHNSTAT_interdomain, + .u.evtchn.flags = 0, + .u.evtchn.deliver.port.port = EVTCHN_TEST1, + .u.evtchn.deliver.port.vcpu = vcpu->id + 1, + .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + /* Test migration to a different vCPU */ + inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE; + inj.u.evtchn.deliver.port.vcpu = vcpu->id; + 
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + inj.u.evtchn.send_port = 197; + inj.u.evtchn.deliver.eventfd.port = 0; + inj.u.evtchn.deliver.eventfd.fd = irq_fd[1]; + inj.u.evtchn.flags = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj); + + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + } + vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); + vinfo->evtchn_upcall_pending = 0; + + struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); + rs->state = 0x5a; + + bool evtchn_irq_expected = false; + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: { + struct kvm_xen_vcpu_attr rst; + long rundelay; + + if (do_runstate_tests) + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + + switch (uc.args[1]) { + case TEST_INJECT_VECTOR: + if (verbose) + printf("Delivering evtchn upcall\n"); + evtchn_irq_expected = true; + vinfo->evtchn_upcall_pending = 1; + break; + + case TEST_RUNSTATE_runnable...TEST_RUNSTATE_offline: + TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen"); + if (!do_runstate_tests) + goto done; + if (verbose) + printf("Testing runstate %s\n", runstate_names[uc.args[1]]); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; + rst.u.runstate.state = uc.args[1] + RUNSTATE_runnable - + TEST_RUNSTATE_runnable; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_RUNSTATE_ADJUST: + if (verbose) + printf("Testing RUNSTATE_ADJUST\n"); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = (uint64_t)-1; + rst.u.runstate.time_blocked = + 0x5a - rs->time[RUNSTATE_blocked]; + rst.u.runstate.time_offline = + 0x6b6b - rs->time[RUNSTATE_offline]; + rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked - + rst.u.runstate.time_offline; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_RUNSTATE_DATA: + if (verbose) + printf("Testing RUNSTATE_DATA\n"); + rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA; + memset(&rst.u, 0, sizeof(rst.u)); + rst.u.runstate.state = RUNSTATE_running; + rst.u.runstate.state_entry_time = 0x6b6b + 0x5a; + rst.u.runstate.time_blocked = 0x6b6b; + rst.u.runstate.time_offline = 0x5a; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst); + break; + + case TEST_STEAL_TIME: + if (verbose) + printf("Testing steal time\n"); + /* Yield until scheduler delay exceeds target */ + rundelay = get_run_delay() + MIN_STEAL_TIME; + do { + sched_yield(); + } while (get_run_delay() < rundelay); + break; + + case TEST_EVTCHN_MASKED: + if (!do_eventfd_tests) + goto done; + if (verbose) + printf("Testing masked event channel\n"); + shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1; + eventfd_write(irq_fd[0], 1UL); + alarm(1); + break; + + case TEST_EVTCHN_UNMASKED: + if (verbose) + printf("Testing unmasked event channel\n"); + /* Unmask that, but deliver the other one */ + shinfo->evtchn_pending[0] = 0; + shinfo->evtchn_mask[0] = 0; + eventfd_write(irq_fd[1], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_SLOWPATH: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + if (verbose) + printf("Testing event channel after memslot change\n"); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0); + 
eventfd_write(irq_fd[0], 1UL); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_SEND_IOCTL: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + if (!do_evtchn_tests) + goto done; + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n"); + + struct kvm_irq_routing_xen_evtchn e; + e.port = EVTCHN_TEST2; + e.vcpu = vcpu->id; + e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send direct to evtchn\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL_SLOWPATH: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send direct to evtchn after memslot change\n"); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + DUMMY_REGION_GPA_2, DUMMY_REGION_SLOT_2, 1, 0); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_EVTCHN_HCALL_EVENTFD: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing guest EVTCHNOP_send to eventfd\n"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_TIMER_SETUP: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[1] = 0; + + if (verbose) + printf("Testing guest oneshot timer\n"); + break; + + case TEST_TIMER_WAIT: + memset(&tmr, 0, sizeof(tmr)); + tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER, + "Timer port not returned"); + TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, + "Timer priority not returned"); + TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time, + "Timer expiry not returned"); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_TIMER_RESTORE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + shinfo->evtchn_pending[0] = 0; + + if (verbose) + printf("Testing restored oneshot timer\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + evtchn_irq_expected = true; + alarm(1); + break; + + case TEST_POLL_READY: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing SCHEDOP_poll with already pending event\n"); + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER; + alarm(1); + break; + + case TEST_POLL_TIMEOUT: + if (verbose) + printf("Testing SCHEDOP_poll timeout\n"); + shinfo->evtchn_pending[0] = 0; + alarm(1); + break; + + case TEST_POLL_MASKED: + if (verbose) + printf("Testing SCHEDOP_poll wake on masked event\n"); + + tmr.u.timer.expires_ns = rs->state_entry_time + 100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case TEST_POLL_WAKE: + shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0; + if (verbose) + printf("Testing SCHEDOP_poll wake on unmasked event\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + 
100000000; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + + /* Read it back and check the pending time is reported correctly */ + tmr.u.timer.expires_ns = 0; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000, + "Timer not reported pending"); + alarm(1); + break; + + case SET_VCPU_INFO: + if (has_shinfo_hva) { + struct kvm_xen_vcpu_attr vih = { + .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA, + .u.hva = (unsigned long)vinfo + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih); + } + break; + + case TEST_TIMER_PAST: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + /* Read timer and check it is no longer pending */ + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending"); + + shinfo->evtchn_pending[0] = 0; + if (verbose) + printf("Testing timer in the past\n"); + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + alarm(1); + break; + + case TEST_LOCKING_SEND_RACE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + alarm(0); + + if (verbose) + printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n"); + + ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm); + TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret)); + + struct kvm_irq_routing_xen_evtchn uxe = { + .port = 1, + .vcpu = vcpu->id, + .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL + }; + + evtchn_irq_expected = true; + for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;) + __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe); + break; + + case TEST_LOCKING_POLL_RACE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + if (verbose) + printf("Testing shinfo lock corruption (SCHEDOP_poll)\n"); + + shinfo->evtchn_pending[0] = 1; + + evtchn_irq_expected = true; + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + + case TEST_LOCKING_POLL_TIMEOUT: + /* + * Optional and possibly repeated sync point. + * Injecting the timer IRQ may fail if the + * shinfo is invalid when the timer expires. + * If the timer has expired but the IRQ hasn't + * been delivered, rearm the timer and retry. + */ + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr); + + /* Resume the guest if the timer is still pending. */ + if (tmr.u.timer.expires_ns) + break; + + /* All done if the IRQ was delivered. 
*/ + if (!evtchn_irq_expected) + break; + + tmr.u.timer.expires_ns = rs->state_entry_time + + SHINFO_RACE_TIMEOUT * 1000000000ULL; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr); + break; + case TEST_DONE: + TEST_ASSERT(!evtchn_irq_expected, + "Expected event channel IRQ but it didn't happen"); + + ret = pthread_cancel(thread); + TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret)); + + ret = pthread_join(thread, 0); + TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret)); + goto done; + + case TEST_GUEST_SAW_IRQ: + TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); + evtchn_irq_expected = false; + break; + } + break; + } + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } + + done: + evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN; + evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset); + + alarm(0); + + /* + * Just a *really* basic check that things are being put in the + * right place. The actual calculations are much the same for + * Xen as they are for the KVM variants, so no need to check. + */ + struct pvclock_wall_clock *wc; + struct pvclock_vcpu_time_info *ti, *ti2; + struct kvm_clock_data kcdata; + long long delta; + + wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00); + ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20); + ti2 = addr_gpa2hva(vm, PVTIME_ADDR); + + if (verbose) { + printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec); + printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul, + ti->tsc_shift, ti->flags); + printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n", + ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul, + ti2->tsc_shift, ti2->flags); + } + + TEST_ASSERT(wc->version && !(wc->version & 1), + "Bad wallclock version %x", wc->version); + + vm_ioctl(vm, KVM_GET_CLOCK, &kcdata); + + if (kcdata.flags & KVM_CLOCK_REALTIME) { + if (verbose) { + printf("KVM_GET_CLOCK clock: %lld.%09lld\n", + kcdata.clock / NSEC_PER_SEC, kcdata.clock % NSEC_PER_SEC); + printf("KVM_GET_CLOCK realtime: %lld.%09lld\n", + kcdata.realtime / NSEC_PER_SEC, kcdata.realtime % NSEC_PER_SEC); + } + + delta = (wc->sec * NSEC_PER_SEC + wc->nsec) - (kcdata.realtime - kcdata.clock); + + /* + * KVM_GET_CLOCK gives CLOCK_REALTIME which jumps on leap seconds updates but + * unfortunately KVM doesn't currently offer a CLOCK_TAI alternative. Accept 1s + * delta as testing clock accuracy is not the goal here. The test just needs to + * check that the value in shinfo is somewhat sane. + */ + TEST_ASSERT(llabs(delta) < NSEC_PER_SEC, + "Guest's epoch from shinfo %d.%09d differs from KVM_GET_CLOCK %lld.%lld", + wc->sec, wc->nsec, (kcdata.realtime - kcdata.clock) / NSEC_PER_SEC, + (kcdata.realtime - kcdata.clock) % NSEC_PER_SEC); + } else { + pr_info("Missing KVM_CLOCK_REALTIME, skipping shinfo epoch sanity check\n"); + } + + TEST_ASSERT(ti->version && !(ti->version & 1), + "Bad time_info version %x", ti->version); + TEST_ASSERT(ti2->version && !(ti2->version & 1), + "Bad time_info version %x", ti->version); + + if (do_runstate_tests) { + /* + * Fetch runstate and check sanity. Strictly speaking in the + * general case we might not expect the numbers to be identical + * but in this case we know we aren't running the vCPU any more. 
+ */ + struct kvm_xen_vcpu_attr rst = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst); + + if (verbose) { + printf("Runstate: %s(%d), entry %" PRIu64 " ns\n", + rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown", + rs->state, rs->state_entry_time); + for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) { + printf("State %s: %" PRIu64 " ns\n", + runstate_names[i], rs->time[i]); + } + } + + /* + * Exercise runstate info at all points across the page boundary, in + * 32-bit and 64-bit mode. In particular, test the case where it is + * configured in 32-bit mode and then switched to 64-bit mode while + * active, which takes it onto the second page. + */ + unsigned long runstate_addr; + struct compat_vcpu_runstate_info *crs; + for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4; + runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) { + + rs = addr_gpa2hva(vm, runstate_addr); + crs = (void *)rs; + + memset(rs, 0xa5, sizeof(*rs)); + + /* Set to compatibility mode */ + lm.u.long_mode = 0; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + /* Set runstate to new address (kernel will write it) */ + struct kvm_xen_vcpu_attr st = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, + .u.gpa = runstate_addr, + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st); + + if (verbose) + printf("Compatibility runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + TEST_ASSERT(crs->state_entry_time == crs->time[0] + + crs->time[1] + crs->time[2] + crs->time[3], + "runstate times don't add up"); + + + /* Now switch to 64-bit mode */ + lm.u.long_mode = 1; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm); + + memset(rs, 0xa5, sizeof(*rs)); + + /* Don't change the address, just trigger a write */ + struct kvm_xen_vcpu_attr adj = { + .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST, + .u.runstate.state = (uint64_t)-1 + }; + vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj); + + if (verbose) + printf("64-bit runstate at %08lx\n", runstate_addr); + + TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch"); + TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time, + "State entry time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running, + "Running time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable, + "Runnable time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked, + "Blocked time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline, + "Offline time mismatch"); + TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL, + "Structure overrun"); + + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); + } + } + 
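The wall clock sanity check above reduces to one subtraction: the Xen wallclock in the shared info page records the guest's epoch, i.e. host CLOCK_REALTIME minus the kvmclock value sampled at the same instant, so it should agree with KVM_GET_CLOCK to within the 1-second slack. A minimal restatement of just that arithmetic (helper name and parameters are illustrative, not part of the patch):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define NSEC_PER_SEC 1000000000LL

	/*
	 * shinfo->wc holds the guest epoch; KVM_GET_CLOCK reports a paired
	 * (realtime, clock) sample, so realtime - clock is the same epoch.
	 * Accept up to one second of skew, as the test above does, because
	 * CLOCK_REALTIME can jump on leap-second updates.
	 */
	static bool shinfo_epoch_sane(uint32_t wc_sec, uint32_t wc_nsec,
				      int64_t kvm_realtime, int64_t kvm_clock)
	{
		int64_t shinfo_epoch = (int64_t)wc_sec * NSEC_PER_SEC + wc_nsec;
		int64_t kvm_epoch = kvm_realtime - kvm_clock;

		return llabs(shinfo_epoch - kvm_epoch) < NSEC_PER_SEC;
	}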
+ kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c new file mode 100644 index 000000000000..e149d0574961 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * xen_vmcall_test + * + * Copyright © 2020 Amazon.com, Inc. or its affiliates. + * + * Userspace hypercall testing + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define HCALL_REGION_GPA 0xc0000000ULL +#define HCALL_REGION_SLOT 10 + +#define INPUTVALUE 17 +#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x) +#define RETVALUE 0xcafef00dfbfbffffUL + +#define XEN_HYPERCALL_MSR 0x40000200 +#define HV_GUEST_OS_ID_MSR 0x40000000 +#define HV_HYPERCALL_MSR 0x40000001 + +#define HVCALL_SIGNAL_EVENT 0x005d +#define HV_STATUS_INVALID_ALIGNMENT 4 + +static void guest_code(void) +{ + unsigned long rax = INPUTVALUE; + unsigned long rdi = ARGVALUE(1); + unsigned long rsi = ARGVALUE(2); + unsigned long rdx = ARGVALUE(3); + unsigned long rcx; + register unsigned long r10 __asm__("r10") = ARGVALUE(4); + register unsigned long r8 __asm__("r8") = ARGVALUE(5); + register unsigned long r9 __asm__("r9") = ARGVALUE(6); + + /* First a direct invocation of 'vmcall' */ + __asm__ __volatile__("vmcall" : + "=a"(rax) : + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Fill in the Xen hypercall page */ + __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR), + "a" (HCALL_REGION_GPA & 0xffffffff), + "d" (HCALL_REGION_GPA >> 32)); + + /* Set Hyper-V Guest OS ID */ + __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR), + "a" (0x5a), "d" (0)); + + /* Hyper-V hypercall page */ + u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1; + __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR), + "a" (msrval & 0xffffffff), + "d" (msrval >> 32)); + + /* Invoke a Xen hypercall */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + INPUTVALUE * 32), + "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx), + "r"(r10), "r"(r8), "r"(r9)); + GUEST_ASSERT(rax == RETVALUE); + + /* Invoke a Hyper-V hypercall */ + rax = 0; + rcx = HVCALL_SIGNAL_EVENT; /* code */ + rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */ + __asm__ __volatile__("call *%1" : "=a"(rax) : + "r"(HCALL_REGION_GPA + PAGE_SIZE), + "a"(rax), "c"(rcx), "d"(rdx), + "r"(r8)); + GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + unsigned int xen_caps; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM); + TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_set_hv_cpuid(vcpu); + + struct kvm_xen_hvm_config hvmc = { + .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, + .msr = XEN_HYPERCALL_MSR, + }; + vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); + + /* Map a region for the hypercall pages */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0); + virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2); + + for (;;) { + volatile struct kvm_run *run = vcpu->run; + struct ucall uc; + + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_XEN) { + TEST_ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL); + TEST_ASSERT_EQ(run->xen.u.hcall.cpl, 0); + TEST_ASSERT_EQ(run->xen.u.hcall.longmode, 1); + TEST_ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE); + 
TEST_ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5)); + TEST_ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6)); + run->xen.u.hcall.result = RETVALUE; + continue; + } + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c index 3529376747c2..f331a4e9bae3 100644 --- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c +++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c @@ -4,72 +4,50 @@ * * Tests for the IA32_XSS MSR. */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ #include <sys/ioctl.h> #include "test_util.h" #include "kvm_util.h" #include "vmx.h" -#define VCPU_ID 1 #define MSR_BITS 64 -#define X86_FEATURE_XSAVES (1<<3) - -bool is_supported_msr(u32 msr_index) -{ - struct kvm_msr_list *list; - bool found = false; - int i; - - list = kvm_get_msr_index_list(); - for (i = 0; i < list->nmsrs; ++i) { - if (list->indices[i] == msr_index) { - found = true; - break; - } - } - - free(list); - return found; -} - int main(int argc, char *argv[]) { - struct kvm_cpuid_entry2 *entry; - bool xss_supported = false; + bool xss_in_msr_list; struct kvm_vm *vm; + struct kvm_vcpu *vcpu; uint64_t xss_val; int i, r; /* Create VM */ - vm = vm_create_default(VCPU_ID, 0, 0); + vm = vm_create_with_one_vcpu(&vcpu, NULL); - if (kvm_get_cpuid_max_basic() >= 0xd) { - entry = kvm_get_supported_cpuid_index(0xd, 1); - xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES); - } - if (!xss_supported) { - print_skip("IA32_XSS is not supported by the vCPU"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVES)); - xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS); + xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS); TEST_ASSERT(xss_val == 0, - "MSR_IA32_XSS should be initialized to zero\n"); + "MSR_IA32_XSS should be initialized to zero"); + + vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val); - vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val); /* * At present, KVM only supports a guest IA32_XSS value of 0. Verify * that trying to set the guest IA32_XSS to an unsupported value fails. * Also, in the future when a non-zero value succeeds check that - * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST. + * IA32_XSS is in the list of MSRs to save/restore. */ + xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS); for (i = 0; i < MSR_BITS; ++i) { - r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i); - TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS), - "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n"); + r = _vcpu_set_msr(vcpu, MSR_IA32_XSS, 1ull << i); + + /* + * Setting a list of MSRs returns the entry that "faulted", or + * the last entry +1 if all MSRs were successfully written. + */ + TEST_ASSERT(!r || r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r)); + TEST_ASSERT(r != 1 || xss_in_msr_list, + "IA32_XSS was able to be set, but was not in save/restore list"); } kvm_vm_free(vm); |
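
As an illustrative aside (not part of the diff above): the loop at the end of xss_msr_test.c depends on the KVM_SET_MSRS return-value convention described in its comment, i.e. the ioctl returns the index of the entry that "faulted", or the number of entries if all were written. The following minimal sketch shows that convention with a raw single-entry KVM_SET_MSRS call; the vcpu_fd parameter is an assumption (an already-created KVM vCPU file descriptor obtained elsewhere), and the helper name is hypothetical.

	/*
	 * Hypothetical sketch only: demonstrates the KVM_SET_MSRS return-value
	 * convention. With one entry, a return of 0 means KVM rejected the MSR
	 * write ("faulted"), and a return of 1 means it was written.
	 */
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int set_one_msr(int vcpu_fd, __u32 index, __u64 data)
	{
		/* struct kvm_msrs is followed in memory by its entry array. */
		struct {
			struct kvm_msrs header;
			struct kvm_msr_entry entry;
		} buf;

		memset(&buf, 0, sizeof(buf));
		buf.header.nmsrs = 1;
		buf.entry.index = index;
		buf.entry.data = data;

		/* Number of MSRs written: 1 on success, 0 if rejected. */
		return ioctl(vcpu_fd, KVM_SET_MSRS, &buf);
	}

Under that assumption, the test's TEST_ASSERT(!r || r == 1, ...) simply accepts both outcomes, and the follow-on assertion requires that a successful write (r == 1) only happens when IA32_XSS is also present in the save/restore MSR list.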