diff options
Diffstat (limited to 'tools/testing/selftests/kvm/lib')
31 files changed, 4801 insertions, 1774 deletions
diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic.c b/tools/testing/selftests/kvm/lib/aarch64/gic.c new file mode 100644 index 000000000000..55668631d546 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) support + */ + +#include <errno.h> +#include <linux/bits.h> +#include <linux/sizes.h> + +#include "kvm_util.h" + +#include <gic.h> +#include "gic_private.h" +#include "processor.h" +#include "spinlock.h" + +static const struct gic_common_ops *gic_common_ops; +static struct spinlock gic_lock; + +static void gic_cpu_init(unsigned int cpu, void *redist_base) +{ + gic_common_ops->gic_cpu_init(cpu, redist_base); +} + +static void +gic_dist_init(enum gic_type type, unsigned int nr_cpus, void *dist_base) +{ + const struct gic_common_ops *gic_ops = NULL; + + spin_lock(&gic_lock); + + /* Distributor initialization is needed only once per VM */ + if (gic_common_ops) { + spin_unlock(&gic_lock); + return; + } + + if (type == GIC_V3) + gic_ops = &gicv3_ops; + + GUEST_ASSERT(gic_ops); + + gic_ops->gic_init(nr_cpus, dist_base); + gic_common_ops = gic_ops; + + /* Make sure that the initialized data is visible to all the vCPUs */ + dsb(sy); + + spin_unlock(&gic_lock); +} + +void gic_init(enum gic_type type, unsigned int nr_cpus, + void *dist_base, void *redist_base) +{ + uint32_t cpu = guest_get_vcpuid(); + + GUEST_ASSERT(type < GIC_TYPE_MAX); + GUEST_ASSERT(dist_base); + GUEST_ASSERT(redist_base); + GUEST_ASSERT(nr_cpus); + + gic_dist_init(type, nr_cpus, dist_base); + gic_cpu_init(cpu, redist_base); +} + +void gic_irq_enable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_enable(intid); +} + +void gic_irq_disable(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_disable(intid); +} + +unsigned int gic_get_and_ack_irq(void) +{ + uint64_t irqstat; + unsigned int intid; + + 
GUEST_ASSERT(gic_common_ops); + + irqstat = gic_common_ops->gic_read_iar(); + intid = irqstat & GENMASK(23, 0); + + return intid; +} + +void gic_set_eoi(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_eoir(intid); +} + +void gic_set_dir(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_write_dir(intid); +} + +void gic_set_eoi_split(bool split) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_eoi_split(split); +} + +void gic_set_priority_mask(uint64_t pmr) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_priority_mask(pmr); +} + +void gic_set_priority(unsigned int intid, unsigned int prio) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_set_priority(intid, prio); +} + +void gic_irq_set_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_active(intid); +} + +void gic_irq_clear_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_active(intid); +} + +bool gic_irq_get_active(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_active(intid); +} + +void gic_irq_set_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_pending(intid); +} + +void gic_irq_clear_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_clear_pending(intid); +} + +bool gic_irq_get_pending(unsigned int intid) +{ + GUEST_ASSERT(gic_common_ops); + return gic_common_ops->gic_irq_get_pending(intid); +} + +void gic_irq_set_config(unsigned int intid, bool is_edge) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_config(intid, is_edge); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_private.h b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h new file mode 100644 index 000000000000..75d07313c893 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_private.h @@ -0,0 +1,32 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ +/* + * ARM Generic Interrupt Controller (GIC) private defines that's only + * shared among the GIC library code. + */ + +#ifndef SELFTEST_KVM_GIC_PRIVATE_H +#define SELFTEST_KVM_GIC_PRIVATE_H + +struct gic_common_ops { + void (*gic_init)(unsigned int nr_cpus, void *dist_base); + void (*gic_cpu_init)(unsigned int cpu, void *redist_base); + void (*gic_irq_enable)(unsigned int intid); + void (*gic_irq_disable)(unsigned int intid); + uint64_t (*gic_read_iar)(void); + void (*gic_write_eoir)(uint32_t irq); + void (*gic_write_dir)(uint32_t irq); + void (*gic_set_eoi_split)(bool split); + void (*gic_set_priority_mask)(uint64_t mask); + void (*gic_set_priority)(uint32_t intid, uint32_t prio); + void (*gic_irq_set_active)(uint32_t intid); + void (*gic_irq_clear_active)(uint32_t intid); + bool (*gic_irq_get_active)(uint32_t intid); + void (*gic_irq_set_pending)(uint32_t intid); + void (*gic_irq_clear_pending)(uint32_t intid); + bool (*gic_irq_get_pending)(uint32_t intid); + void (*gic_irq_set_config)(uint32_t intid, bool is_edge); +}; + +extern const struct gic_common_ops gicv3_ops; + +#endif /* SELFTEST_KVM_GIC_PRIVATE_H */ diff --git a/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c new file mode 100644 index 000000000000..263bf3ed8fd5 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/gic_v3.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 support + */ + +#include <linux/sizes.h> + +#include "kvm_util.h" +#include "processor.h" +#include "delay.h" + +#include "gic_v3.h" +#include "gic_private.h" + +struct gicv3_data { + void *dist_base; + void *redist_base[GICV3_MAX_CPUS]; + unsigned int nr_cpus; + unsigned int nr_spis; +}; + +#define sgi_base_from_redist(redist_base) (redist_base + SZ_64K) +#define DIST_BIT (1U << 31) + +enum gicv3_intid_range { + SGI_RANGE, + PPI_RANGE, + SPI_RANGE, + INVALID_RANGE, +}; + +static 
struct gicv3_data gicv3_data; + +static void gicv3_gicd_wait_for_rwp(void) +{ + unsigned int count = 100000; /* 1s */ + + while (readl(gicv3_data.dist_base + GICD_CTLR) & GICD_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_gicr_wait_for_rwp(void *redist_base) +{ + unsigned int count = 100000; /* 1s */ + + while (readl(redist_base + GICR_CTLR) & GICR_CTLR_RWP) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static void gicv3_wait_for_rwp(uint32_t cpu_or_dist) +{ + if (cpu_or_dist & DIST_BIT) + gicv3_gicd_wait_for_rwp(); + else + gicv3_gicr_wait_for_rwp(gicv3_data.redist_base[cpu_or_dist]); +} + +static enum gicv3_intid_range get_intid_range(unsigned int intid) +{ + switch (intid) { + case 0 ... 15: + return SGI_RANGE; + case 16 ... 31: + return PPI_RANGE; + case 32 ... 1019: + return SPI_RANGE; + } + + /* We should not be reaching here */ + GUEST_ASSERT(0); + + return INVALID_RANGE; +} + +static uint64_t gicv3_read_iar(void) +{ + uint64_t irqstat = read_sysreg_s(SYS_ICC_IAR1_EL1); + + dsb(sy); + return irqstat; +} + +static void gicv3_write_eoir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_EOIR1_EL1); + isb(); +} + +static void gicv3_write_dir(uint32_t irq) +{ + write_sysreg_s(irq, SYS_ICC_DIR_EL1); + isb(); +} + +static void gicv3_set_priority_mask(uint64_t mask) +{ + write_sysreg_s(mask, SYS_ICC_PMR_EL1); +} + +static void gicv3_set_eoi_split(bool split) +{ + uint32_t val; + + /* + * All other fields are read-only, so no need to read CTLR first. In + * fact, the kernel does the same. + */ + val = split ? (1U << 1) : 0; + write_sysreg_s(val, SYS_ICC_CTLR_EL1); + isb(); +} + +uint32_t gicv3_reg_readl(uint32_t cpu_or_dist, uint64_t offset) +{ + void *base = cpu_or_dist & DIST_BIT ? gicv3_data.dist_base + : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + return readl(base + offset); +} + +void gicv3_reg_writel(uint32_t cpu_or_dist, uint64_t offset, uint32_t reg_val) +{ + void *base = cpu_or_dist & DIST_BIT ? 
gicv3_data.dist_base + : sgi_base_from_redist(gicv3_data.redist_base[cpu_or_dist]); + writel(reg_val, base + offset); +} + +uint32_t gicv3_getl_fields(uint32_t cpu_or_dist, uint64_t offset, uint32_t mask) +{ + return gicv3_reg_readl(cpu_or_dist, offset) & mask; +} + +void gicv3_setl_fields(uint32_t cpu_or_dist, uint64_t offset, + uint32_t mask, uint32_t reg_val) +{ + uint32_t tmp = gicv3_reg_readl(cpu_or_dist, offset) & ~mask; + + tmp |= (reg_val & mask); + gicv3_reg_writel(cpu_or_dist, offset, tmp); +} + +/* + * We use a single offset for the distributor and redistributor maps as they + * have the same value in both. The only exceptions are registers that only + * exist in one and not the other, like GICR_WAKER that doesn't exist in the + * distributor map. Such registers are conveniently marked as reserved in the + * map that doesn't implement it; like GICR_WAKER's offset of 0x0014 being + * marked as "Reserved" in the Distributor map. + */ +static void gicv3_access_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, + bool write, uint32_t *val) +{ + uint32_t cpu = guest_get_vcpuid(); + enum gicv3_intid_range intid_range = get_intid_range(intid); + uint32_t fields_per_reg, index, mask, shift; + uint32_t cpu_or_dist; + + GUEST_ASSERT(bits_per_field <= reg_bits); + GUEST_ASSERT(!write || *val < (1U << bits_per_field)); + /* + * This function does not support 64 bit accesses. Just asserting here + * until we implement readq/writeq. + */ + GUEST_ASSERT(reg_bits == 32); + + fields_per_reg = reg_bits / bits_per_field; + index = intid % fields_per_reg; + shift = index * bits_per_field; + mask = ((1U << bits_per_field) - 1) << shift; + + /* Set offset to the actual register holding intid's config. */ + offset += (intid / fields_per_reg) * (reg_bits / 8); + + cpu_or_dist = (intid_range == SPI_RANGE) ? 
DIST_BIT : cpu; + + if (write) + gicv3_setl_fields(cpu_or_dist, offset, mask, *val << shift); + *val = gicv3_getl_fields(cpu_or_dist, offset, mask) >> shift; +} + +static void gicv3_write_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field, uint32_t val) +{ + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, true, &val); +} + +static uint32_t gicv3_read_reg(uint32_t intid, uint64_t offset, + uint32_t reg_bits, uint32_t bits_per_field) +{ + uint32_t val; + + gicv3_access_reg(intid, offset, reg_bits, + bits_per_field, false, &val); + return val; +} + +static void gicv3_set_priority(uint32_t intid, uint32_t prio) +{ + gicv3_write_reg(intid, GICD_IPRIORITYR, 32, 8, prio); +} + +/* Sets the intid to be level-sensitive or edge-triggered. */ +static void gicv3_irq_set_config(uint32_t intid, bool is_edge) +{ + uint32_t val; + + /* N/A for private interrupts. */ + GUEST_ASSERT(get_intid_range(intid) == SPI_RANGE); + val = is_edge ? 2 : 0; + gicv3_write_reg(intid, GICD_ICFGR, 32, 2, val); +} + +static void gicv3_irq_enable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ISENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? DIST_BIT : cpu); +} + +static void gicv3_irq_disable(uint32_t intid) +{ + bool is_spi = get_intid_range(intid) == SPI_RANGE; + uint32_t cpu = guest_get_vcpuid(); + + gicv3_write_reg(intid, GICD_ICENABLER, 32, 1, 1); + gicv3_wait_for_rwp(is_spi ? 
DIST_BIT : cpu); +} + +static void gicv3_irq_set_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISACTIVER, 32, 1, 1); +} + +static void gicv3_irq_clear_active(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICACTIVER, 32, 1, 1); +} + +static bool gicv3_irq_get_active(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISACTIVER, 32, 1); +} + +static void gicv3_irq_set_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ISPENDR, 32, 1, 1); +} + +static void gicv3_irq_clear_pending(uint32_t intid) +{ + gicv3_write_reg(intid, GICD_ICPENDR, 32, 1, 1); +} + +static bool gicv3_irq_get_pending(uint32_t intid) +{ + return gicv3_read_reg(intid, GICD_ISPENDR, 32, 1); +} + +static void gicv3_enable_redist(void *redist_base) +{ + uint32_t val = readl(redist_base + GICR_WAKER); + unsigned int count = 100000; /* 1s */ + + val &= ~GICR_WAKER_ProcessorSleep; + writel(val, redist_base + GICR_WAKER); + + /* Wait until the processor is 'active' */ + while (readl(redist_base + GICR_WAKER) & GICR_WAKER_ChildrenAsleep) { + GUEST_ASSERT(count--); + udelay(10); + } +} + +static inline void *gicr_base_cpu(void *redist_base, uint32_t cpu) +{ + /* Align all the redistributors sequentially */ + return redist_base + cpu * SZ_64K * 2; +} + +static void gicv3_cpu_init(unsigned int cpu, void *redist_base) +{ + void *sgi_base; + unsigned int i; + void *redist_base_cpu; + + GUEST_ASSERT(cpu < gicv3_data.nr_cpus); + + redist_base_cpu = gicr_base_cpu(redist_base, cpu); + sgi_base = sgi_base_from_redist(redist_base_cpu); + + gicv3_enable_redist(redist_base_cpu); + + /* + * Mark all the SGI and PPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. 
+ */ + writel(~0, sgi_base + GICR_IGROUPR0); + writel(~0, sgi_base + GICR_ICACTIVER0); + writel(~0, sgi_base + GICR_ICENABLER0); + + /* Set a default priority for all the SGIs and PPIs */ + for (i = 0; i < 32; i += 4) + writel(GICD_INT_DEF_PRI_X4, + sgi_base + GICR_IPRIORITYR0 + i); + + gicv3_gicr_wait_for_rwp(redist_base_cpu); + + /* Enable the GIC system register (ICC_*) access */ + write_sysreg_s(read_sysreg_s(SYS_ICC_SRE_EL1) | ICC_SRE_EL1_SRE, + SYS_ICC_SRE_EL1); + + /* Set a default priority threshold */ + write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + + /* Enable non-secure Group-1 interrupts */ + write_sysreg_s(ICC_IGRPEN1_EL1_ENABLE, SYS_ICC_GRPEN1_EL1); + + gicv3_data.redist_base[cpu] = redist_base_cpu; +} + +static void gicv3_dist_init(void) +{ + void *dist_base = gicv3_data.dist_base; + unsigned int i; + + /* Disable the distributor until we set things up */ + writel(0, dist_base + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); + + /* + * Mark all the SPI interrupts as non-secure Group-1. + * Also, deactivate and disable them. 
+ */ + for (i = 32; i < gicv3_data.nr_spis; i += 32) { + writel(~0, dist_base + GICD_IGROUPR + i / 8); + writel(~0, dist_base + GICD_ICACTIVER + i / 8); + writel(~0, dist_base + GICD_ICENABLER + i / 8); + } + + /* Set a default priority for all the SPIs */ + for (i = 32; i < gicv3_data.nr_spis; i += 4) + writel(GICD_INT_DEF_PRI_X4, + dist_base + GICD_IPRIORITYR + i); + + /* Wait for the settings to sync-in */ + gicv3_gicd_wait_for_rwp(); + + /* Finally, enable the distributor globally with ARE */ + writel(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | + GICD_CTLR_ENABLE_G1, dist_base + GICD_CTLR); + gicv3_gicd_wait_for_rwp(); +} + +static void gicv3_init(unsigned int nr_cpus, void *dist_base) +{ + GUEST_ASSERT(nr_cpus <= GICV3_MAX_CPUS); + + gicv3_data.nr_cpus = nr_cpus; + gicv3_data.dist_base = dist_base; + gicv3_data.nr_spis = GICD_TYPER_SPIS( + readl(gicv3_data.dist_base + GICD_TYPER)); + if (gicv3_data.nr_spis > 1020) + gicv3_data.nr_spis = 1020; + + /* + * Initialize only the distributor for now. + * The redistributor and CPU interfaces are initialized + * later for every PE. 
+ */ + gicv3_dist_init(); +} + +const struct gic_common_ops gicv3_ops = { + .gic_init = gicv3_init, + .gic_cpu_init = gicv3_cpu_init, + .gic_irq_enable = gicv3_irq_enable, + .gic_irq_disable = gicv3_irq_disable, + .gic_read_iar = gicv3_read_iar, + .gic_write_eoir = gicv3_write_eoir, + .gic_write_dir = gicv3_write_dir, + .gic_set_priority_mask = gicv3_set_priority_mask, + .gic_set_eoi_split = gicv3_set_eoi_split, + .gic_set_priority = gicv3_set_priority, + .gic_irq_set_active = gicv3_irq_set_active, + .gic_irq_clear_active = gicv3_irq_clear_active, + .gic_irq_get_active = gicv3_irq_get_active, + .gic_irq_set_pending = gicv3_irq_set_pending, + .gic_irq_clear_pending = gicv3_irq_clear_pending, + .gic_irq_get_pending = gicv3_irq_get_pending, + .gic_irq_set_config = gicv3_irq_set_config, +}; diff --git a/tools/testing/selftests/kvm/lib/aarch64/handlers.S b/tools/testing/selftests/kvm/lib/aarch64/handlers.S new file mode 100644 index 000000000000..0e443eadfac6 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/handlers.S @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +.macro save_registers + add sp, sp, #-16 * 17 + + stp x0, x1, [sp, #16 * 0] + stp x2, x3, [sp, #16 * 1] + stp x4, x5, [sp, #16 * 2] + stp x6, x7, [sp, #16 * 3] + stp x8, x9, [sp, #16 * 4] + stp x10, x11, [sp, #16 * 5] + stp x12, x13, [sp, #16 * 6] + stp x14, x15, [sp, #16 * 7] + stp x16, x17, [sp, #16 * 8] + stp x18, x19, [sp, #16 * 9] + stp x20, x21, [sp, #16 * 10] + stp x22, x23, [sp, #16 * 11] + stp x24, x25, [sp, #16 * 12] + stp x26, x27, [sp, #16 * 13] + stp x28, x29, [sp, #16 * 14] + + /* + * This stores sp_el1 into ex_regs.sp so exception handlers can "look" + * at it. It will _not_ be used to restore the sp on return from the + * exception so handlers can not update it. 
+ */ + add x1, sp, #16 * 17 + stp x30, x1, [sp, #16 * 15] /* x30, SP */ + + mrs x1, elr_el1 + mrs x2, spsr_el1 + stp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ +.endm + +.macro restore_registers + ldp x1, x2, [sp, #16 * 16] /* PC, PSTATE */ + msr elr_el1, x1 + msr spsr_el1, x2 + + /* sp is not restored */ + ldp x30, xzr, [sp, #16 * 15] /* x30, SP */ + + ldp x28, x29, [sp, #16 * 14] + ldp x26, x27, [sp, #16 * 13] + ldp x24, x25, [sp, #16 * 12] + ldp x22, x23, [sp, #16 * 11] + ldp x20, x21, [sp, #16 * 10] + ldp x18, x19, [sp, #16 * 9] + ldp x16, x17, [sp, #16 * 8] + ldp x14, x15, [sp, #16 * 7] + ldp x12, x13, [sp, #16 * 6] + ldp x10, x11, [sp, #16 * 5] + ldp x8, x9, [sp, #16 * 4] + ldp x6, x7, [sp, #16 * 3] + ldp x4, x5, [sp, #16 * 2] + ldp x2, x3, [sp, #16 * 1] + ldp x0, x1, [sp, #16 * 0] + + add sp, sp, #16 * 17 + + eret +.endm + +.pushsection ".entry.text", "ax" +.balign 0x800 +.global vectors +vectors: +.popsection + +.set vector, 0 + +/* + * Build an exception handler for vector and append a jump to it into + * vectors (while making sure that it's 0x80 aligned). + */ +.macro HANDLER, label +handler_\label: + save_registers + mov x0, sp + mov x1, #vector + bl route_exception + restore_registers + +.pushsection ".entry.text", "ax" +.balign 0x80 + b handler_\label +.popsection + +.set vector, vector + 1 +.endm + +.macro HANDLER_INVALID +.pushsection ".entry.text", "ax" +.balign 0x80 +/* This will abort so no need to save and restore registers. */ + mov x0, #vector + mov x1, #0 /* ec */ + mov x2, #0 /* valid_ec */ + b kvm_exit_unexpected_exception +.popsection + +.set vector, vector + 1 +.endm + +/* + * Caution: be sure to not add anything between the declaration of vectors + * above and these macro calls that will build the vectors table below it. 
+ */ + HANDLER_INVALID // Synchronous EL1t + HANDLER_INVALID // IRQ EL1t + HANDLER_INVALID // FIQ EL1t + HANDLER_INVALID // Error EL1t + + HANDLER el1h_sync // Synchronous EL1h + HANDLER el1h_irq // IRQ EL1h + HANDLER el1h_fiq // FIQ EL1h + HANDLER el1h_error // Error EL1h + + HANDLER el0_sync_64 // Synchronous 64-bit EL0 + HANDLER el0_irq_64 // IRQ 64-bit EL0 + HANDLER el0_fiq_64 // FIQ 64-bit EL0 + HANDLER el0_error_64 // Error 64-bit EL0 + + HANDLER el0_sync_32 // Synchronous 32-bit EL0 + HANDLER el0_irq_32 // IRQ 32-bit EL0 + HANDLER el0_fiq_32 // FIQ 32-bit EL0 + HANDLER el0_error_32 // Error 32-bit EL0 diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 86036a59a668..6f5551368944 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -5,17 +5,17 @@ * Copyright (C) 2018, Red Hat, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ - #include <linux/compiler.h> +#include <assert.h> +#include "guest_modes.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000 +static vm_vaddr_t exception_handlers; + static uint64_t page_align(struct kvm_vm *vm, uint64_t v) { return (v + vm->page_size) & ~(vm->page_size - 1); @@ -74,19 +74,19 @@ static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm) return 1 << (vm->page_shift - 3); } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { if (!vm->pgd_created) { vm_paddr_t paddr = vm_phy_pages_alloc(vm, page_align(vm, ptrs_per_pgd(vm) * 8) / vm->page_size, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); vm->pgd = paddr; vm->pgd_created = true; } } -void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot, uint64_t 
flags) +static void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t flags) { uint8_t attr_idx = flags & 7; uint64_t *ptep; @@ -106,46 +106,39 @@ void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, paddr, vm->max_gfn, vm->page_size); ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; switch (vm->pgtable_levels) { case 4: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 3: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8; - if (!*ptep) { - *ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - *ptep |= 3; - } + if (!*ptep) + *ptep = vm_alloc_page_table(vm) | 3; /* fall through */ case 2: ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8; break; default: - TEST_ASSERT(false, "Page table levels must be 2, 3, or 4"); + TEST_FAIL("Page table levels must be 2, 3, or 4"); } *ptep = paddr | 3; *ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */; } -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { uint64_t attr_idx = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */ - _virt_pg_map(vm, vaddr, paddr, pgd_memslot, attr_idx); + _virt_pg_map(vm, vaddr, paddr, attr_idx); } -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint64_t *ptep; @@ -173,20 +166,19 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) goto unmapped_gva; break; default: - TEST_ASSERT(false, "Page table levels must be 2, 3, 
or 4"); + TEST_FAIL("Page table levels must be 2, 3, or 4"); } return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); unmapped_gva: - TEST_ASSERT(false, "No mapping for vm virtual address, " - "gva: 0x%lx", gva); + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(1); } static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level) { -#ifdef DEBUG_VM +#ifdef DEBUG static const char * const type[] = { "", "pud", "pmd", "pte" }; uint64_t pte, *ptep; @@ -197,13 +189,13 @@ static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t p ptep = addr_gpa2hva(vm, pte); if (!*ptep) continue; - printf("%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", type[level], pte, *ptep, ptep); pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level + 1); } #endif } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { int level = 4 - (vm->pgtable_levels - 1); uint64_t pgd, *ptep; @@ -215,29 +207,15 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) ptep = addr_gpa2hva(vm, pgd); if (!*ptep) continue; - printf("%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", pgd, *ptep, ptep); pte_dump(stream, vm, indent + 1, pte_addr(vm, *ptep), level); } } -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) -{ - uint64_t ptrs_per_4k_pte = 512; - uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2; - struct kvm_vm *vm; - - vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); - vm_vcpu_add_default(vm, vcpuid, guest_code); - - return vm; -} - -void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init) +void 
aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { struct kvm_vcpu_init default_init = { .target = -1, }; + struct kvm_vm *vm = vcpu->vm; uint64_t sctlr_el1, tcr_el1; if (!init) @@ -249,46 +227,69 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini init->target = preferred.target; } - vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init); + vcpu_ioctl(vcpu, KVM_ARM_VCPU_INIT, init); /* * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15 * registers, which the variable argument list macros do. */ - set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_CPACR_EL1), 3 << 20); - get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1); - get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), &sctlr_el1); + vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), &tcr_el1); + /* Configure base granule size */ switch (vm->mode) { case VM_MODE_P52V48_4K: - TEST_ASSERT(false, "AArch64 does not support 4K sized pages " - "with 52-bit physical address ranges"); + TEST_FAIL("AArch64 does not support 4K sized pages " + "with 52-bit physical address ranges"); case VM_MODE_PXXV48_4K: - TEST_ASSERT(false, "AArch64 does not support 4K sized pages " - "with ANY-bit physical address ranges"); + TEST_FAIL("AArch64 does not support 4K sized pages " + "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: + case VM_MODE_P48V48_64K: + case VM_MODE_P40V48_64K: + case VM_MODE_P36V48_64K: tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ - tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + break; + case VM_MODE_P48V48_16K: + case VM_MODE_P40V48_16K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 2ul << 14; /* TG0 = 16KB */ break; case VM_MODE_P48V48_4K: + case VM_MODE_P40V48_4K: + case VM_MODE_P36V48_4K: tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ - tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ break; + default: + TEST_FAIL("Unknown guest 
mode, mode: 0x%x", vm->mode); + } + + /* Configure output size */ + switch (vm->mode) { + case VM_MODE_P52V48_64K: + tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ + break; + case VM_MODE_P48V48_4K: + case VM_MODE_P48V48_16K: case VM_MODE_P48V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 5ul << 32; /* IPS = 48 bits */ break; case VM_MODE_P40V48_4K: - tcr_el1 |= 0ul << 14; /* TG0 = 4KB */ - tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ - break; + case VM_MODE_P40V48_16K: case VM_MODE_P40V48_64K: - tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 2ul << 32; /* IPS = 40 bits */ break; + case VM_MODE_P36V48_4K: + case VM_MODE_P36V48_16K: + case VM_MODE_P36V48_64K: + case VM_MODE_P36V47_16K: + tcr_el1 |= 1ul << 32; /* IPS = 36 bits */ + break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12) /* M | C | I */; @@ -296,40 +297,234 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12); tcr_el1 |= (64 - vm->va_bits) /* T0SZ */; - set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1); - set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1); - set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1); - set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_SCTLR_EL1), sctlr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR_EL1), tcr_el1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MAIR_EL1), DEFAULT_MAIR_EL1); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TTBR0_EL1), vm->pgd); + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TPIDR_EL1), vcpu->id); } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { uint64_t pstate, pc; - get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate); - get_reg(vm, vcpuid, 
ARM64_CORE_REG(regs.pc), &pc); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pstate), &pstate); + vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc), &pc); fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n", indent, "", pstate, pc); } -void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_init *init, void *guest_code) +struct kvm_vcpu *aarch64_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + struct kvm_vcpu_init *init, void *guest_code) { size_t stack_size = vm->page_size == 4096 ? DEFAULT_STACK_PGS * vm->page_size : vm->page_size; uint64_t stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_ARM64_GUEST_STACK_VADDR_MIN); + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); + + aarch64_vcpu_setup(vcpu, init); + + vcpu_set_reg(vcpu, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); + + return vcpu; +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + return aarch64_vcpu_add(vm, vcpu_id, NULL, guest_code); +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
+{ + va_list ap; + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u\n", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.regs[i]), + va_arg(ap, uint64_t)); + } + + va_end(ap); +} + +void kvm_exit_unexpected_exception(int vector, uint64_t ec, bool valid_ec) +{ + ucall(UCALL_UNHANDLED, 3, vector, ec, valid_ec); + while (1) + ; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; - vm_vcpu_add(vm, vcpuid); - aarch64_vcpu_setup(vm, vcpuid, init); + if (get_ucall(vcpu, &uc) != UCALL_UNHANDLED) + return; + + if (uc.args[2]) /* valid_ec */ { + assert(VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx, ec:0x%lx)", + uc.args[0], uc.args[1]); + } else { + assert(!VECTOR_IS_SYNC(uc.args[0])); + TEST_FAIL("Unexpected exception (vector:0x%lx)", + uc.args[0]); + } +} + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM][ESR_EC_NUM]; +}; - set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_vaddr + stack_size); - set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code); +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +{ + extern char vectors; + + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VBAR_EL1), (uint64_t)&vectors); +} + +void route_exception(struct ex_regs *regs, int vector) +{ + struct handlers *handlers = (struct handlers *)exception_handlers; + bool valid_ec; + int ec = 0; + + switch (vector) { + case VECTOR_SYNC_CURRENT: + case VECTOR_SYNC_LOWER_64: + ec = (read_sysreg(esr_el1) >> ESR_EC_SHIFT) & ESR_EC_MASK; + valid_ec = true; + break; + case VECTOR_IRQ_CURRENT: + case VECTOR_IRQ_LOWER_64: + case VECTOR_FIQ_CURRENT: + case VECTOR_FIQ_LOWER_64: + case VECTOR_ERROR_CURRENT: + case VECTOR_ERROR_LOWER_64: + ec = 0; + valid_ec = false; + break; + default: + valid_ec = false; + goto unexpected_exception; + } + + if (handlers && handlers->exception_handlers[vector][ec]) + return 
handlers->exception_handlers[vector][ec](regs); + +unexpected_exception: + kvm_exit_unexpected_exception(vector, ec, valid_ec); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers), + vm->page_size); + + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_sync_handler(struct kvm_vm *vm, int vector, int ec, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + assert(ec < ESR_EC_NUM); + handlers->exception_handlers[vector][ec] = handler; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(!VECTOR_IS_SYNC(vector)); + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector][0] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return read_sysreg(tpidr_el1); +} + +void aarch64_get_supported_page_sizes(uint32_t ipa, + bool *ps4k, bool *ps16k, bool *ps64k) +{ + struct kvm_vcpu_init preferred_init; + int kvm_fd, vm_fd, vcpu_fd, err; + uint64_t val; + struct kvm_one_reg reg = { + .id = KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR0_EL1), + .addr = (uint64_t)&val, + }; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, (void *)(unsigned long)ipa); + TEST_ASSERT(vm_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm_fd)); + + vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0); + TEST_ASSERT(vcpu_fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu_fd)); + + err = ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_PREFERRED_TARGET, err)); + err = ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &preferred_init); + TEST_ASSERT(err == 0, KVM_IOCTL_ERROR(KVM_ARM_VCPU_INIT, err)); + + err = ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg); + TEST_ASSERT(err == 0, 
KVM_IOCTL_ERROR(KVM_GET_ONE_REG, vcpu_fd)); + + *ps4k = ((val >> 28) & 0xf) != 0xf; + *ps64k = ((val >> 24) & 0xf) == 0; + *ps16k = ((val >> 20) & 0xf) != 0; + + close(vcpu_fd); + close(vm_fd); + close(kvm_fd); +} + +/* + * arm64 doesn't have a true default mode, so start by computing the + * available IPA space and page sizes early. + */ +void __attribute__((constructor)) init_guest_modes(void) +{ + guest_modes_append_default(); } -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +void smccc_hvc(uint32_t function_id, uint64_t arg0, uint64_t arg1, + uint64_t arg2, uint64_t arg3, uint64_t arg4, uint64_t arg5, + uint64_t arg6, struct arm_smccc_res *res) { - aarch64_vcpu_add_default(vm, vcpuid, NULL, guest_code); + asm volatile("mov w0, %w[function_id]\n" + "mov x1, %[arg0]\n" + "mov x2, %[arg1]\n" + "mov x3, %[arg2]\n" + "mov x4, %[arg3]\n" + "mov x5, %[arg4]\n" + "mov x6, %[arg5]\n" + "mov x7, %[arg6]\n" + "hvc #0\n" + "mov %[res0], x0\n" + "mov %[res1], x1\n" + "mov %[res2], x2\n" + "mov %[res3], x3\n" + : [res0] "=r"(res->a0), [res1] "=r"(res->a1), + [res2] "=r"(res->a2), [res3] "=r"(res->a3) + : [function_id] "r"(function_id), [arg0] "r"(arg0), + [arg1] "r"(arg1), [arg2] "r"(arg2), [arg3] "r"(arg3), + [arg4] "r"(arg4), [arg5] "r"(arg5), [arg6] "r"(arg6) + : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7"); } diff --git a/tools/testing/selftests/kvm/lib/aarch64/spinlock.c b/tools/testing/selftests/kvm/lib/aarch64/spinlock.c new file mode 100644 index 000000000000..a076e780be5d --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/spinlock.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM64 Spinlock support + */ +#include <stdint.h> + +#include "spinlock.h" + +void spin_lock(struct spinlock *lock) +{ + int val, res; + + asm volatile( + "1: ldaxr %w0, [%2]\n" + " cbnz %w0, 1b\n" + " mov %w0, #1\n" + " stxr %w1, %w0, [%2]\n" + " cbnz %w1, 1b\n" + : "=&r" (val), "=&r" (res) + : "r" (&lock->v) + : "memory"); +} + 
+void spin_unlock(struct spinlock *lock) +{ + asm volatile("stlr wzr, [%0]\n" : : "r" (&lock->v) : "memory"); +} diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index 6cd91970fbad..ed237b744690 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -5,7 +5,6 @@ * Copyright (C) 2018, Red Hat, Inc. */ #include "kvm_util.h" -#include "../kvm_util_internal.h" static vm_vaddr_t *ucall_exit_mmio_addr; @@ -14,7 +13,7 @@ static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa) if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1)) return false; - virt_pg_map(vm, gpa, gpa, 0); + virt_pg_map(vm, gpa, gpa); ucall_exit_mmio_addr = (vm_vaddr_t *)gpa; sync_global_to_guest(vm, ucall_exit_mmio_addr); @@ -52,7 +51,7 @@ void ucall_init(struct kvm_vm *vm, void *arg) * lower and won't match physical addresses. */ bits = vm->va_bits - 1; - bits = vm->pa_bits < bits ? vm->pa_bits : bits; + bits = min(vm->pa_bits, bits); end = 1ul << bits; start = end * 5 / 8; step = end / 16; @@ -62,7 +61,7 @@ void ucall_init(struct kvm_vm *vm, void *arg) if (ucall_mmio_init(vm, start + offset)) return; } - TEST_ASSERT(false, "Can't find a ucall mmio address"); + TEST_FAIL("Can't find a ucall mmio address"); } void ucall_uninit(struct kvm_vm *vm) @@ -73,27 +72,29 @@ void ucall_uninit(struct kvm_vm *vm) void ucall(uint64_t cmd, int nargs, ...) { - struct ucall uc = { - .cmd = cmd, - }; + struct ucall uc = {}; va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? 
nargs : UCALL_MAX_ARGS; + WRITE_ONCE(uc.cmd, cmd); + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); + WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); va_end(va); - *ucall_exit_mmio_addr = (vm_vaddr_t)&uc; + WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_MMIO && run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) { vm_vaddr_t gva; @@ -101,9 +102,9 @@ uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8, "Unexpected ucall exit mmio address access"); memcpy(&gva, run->mmio.data, sizeof(gva)); - memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall)); + memcpy(&ucall, addr_gva2hva(vcpu->vm, gva), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c new file mode 100644 index 000000000000..b5f28d21a947 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Generic Interrupt Controller (GIC) v3 host support + */ + +#include <linux/kvm.h> +#include <linux/sizes.h> +#include <asm/kvm_para.h> +#include <asm/kvm.h> + +#include "kvm_util.h" +#include "vgic.h" +#include "gic.h" +#include "gic_v3.h" + +/* + * vGIC-v3 default host setup + * + * Input args: + * vm - KVM VM + * nr_vcpus - Number of vCPUs supported by this VM + * gicd_base_gpa - Guest Physical Address of the Distributor region + * gicr_base_gpa - Guest Physical Address of the Redistributor 
region + * + * Output args: None + * + * Return: GIC file-descriptor or negative error code upon failure + * + * The function creates a vGIC-v3 device and maps the distributor and + * redistributor regions of the guest. Since it depends on the number of + * vCPUs for the VM, it must be called after all the vCPUs have been created. + */ +int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, + uint64_t gicd_base_gpa, uint64_t gicr_base_gpa) +{ + int gic_fd; + uint64_t redist_attr; + struct list_head *iter; + unsigned int nr_gic_pages, nr_vcpus_created = 0; + + TEST_ASSERT(nr_vcpus, "Number of vCPUs cannot be empty\n"); + + /* + * Make sure that the caller is in fact calling this + * function after all the vCPUs are added. + */ + list_for_each(iter, &vm->vcpus) + nr_vcpus_created++; + TEST_ASSERT(nr_vcpus == nr_vcpus_created, + "Number of vCPUs requested (%u) doesn't match with the ones created for the VM (%u)\n", + nr_vcpus, nr_vcpus_created); + + /* Distributor setup */ + gic_fd = __kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3); + if (gic_fd < 0) + return gic_fd; + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_DIST, &gicd_base_gpa); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, KVM_VGIC_V3_DIST_SIZE); + virt_map(vm, gicd_base_gpa, gicd_base_gpa, nr_gic_pages); + + /* Redistributor setup */ + redist_attr = REDIST_REGION_ATTR_ADDR(nr_vcpus, gicr_base_gpa, 0, 0); + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &redist_attr); + nr_gic_pages = vm_calc_num_guest_pages(vm->mode, + KVM_VGIC_V3_REDIST_SIZE * nr_vcpus); + virt_map(vm, gicr_base_gpa, gicr_base_gpa, nr_gic_pages); + + kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL); + + return gic_fd; +} + 
+/* should only work for level sensitive interrupts */ +int _kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + uint64_t attr = 32 * (intid / 32); + uint64_t index = intid % 32; + uint64_t val; + int ret; + + ret = __kvm_device_attr_get(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + if (ret != 0) + return ret; + + val |= 1U << index; + ret = __kvm_device_attr_set(gic_fd, KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, + attr, &val); + return ret; +} + +void kvm_irq_set_level_info(int gic_fd, uint32_t intid, int level) +{ + int ret = _kvm_irq_set_level_info(gic_fd, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO, ret)); +} + +int _kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + uint32_t irq = intid & KVM_ARM_IRQ_NUM_MASK; + + TEST_ASSERT(!INTID_IS_SGI(intid), "KVM_IRQ_LINE's interface itself " + "doesn't allow injecting SGIs. There's no mask for it."); + + if (INTID_IS_PPI(intid)) + irq |= KVM_ARM_IRQ_TYPE_PPI << KVM_ARM_IRQ_TYPE_SHIFT; + else + irq |= KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT; + + return _kvm_irq_line(vm, irq, level); +} + +void kvm_arm_irq_line(struct kvm_vm *vm, uint32_t intid, int level) +{ + int ret = _kvm_arm_irq_line(vm, intid, level); + + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); +} + +static void vgic_poke_irq(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu, + uint64_t reg_off) +{ + uint64_t reg = intid / 32; + uint64_t index = intid % 32; + uint64_t attr = reg_off + reg * 4; + uint64_t val; + bool intid_is_private = INTID_IS_SGI(intid) || INTID_IS_PPI(intid); + + uint32_t group = intid_is_private ? KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + : KVM_DEV_ARM_VGIC_GRP_DIST_REGS; + + if (intid_is_private) { + /* TODO: only vcpu 0 implemented for now. */ + assert(vcpu->id == 0); + attr += SZ_64K; + } + + /* Check that the addr part of the attr is within 32 bits. 
*/ + assert((attr & ~KVM_DEV_ARM_VGIC_OFFSET_MASK) == 0); + + /* + * All calls will succeed, even with invalid intid's, as long as the + * addr part of the attr is within 32 bits (checked above). An invalid + * intid will just make the read/writes point to above the intended + * register space (i.e., ICPENDR after ISPENDR). + */ + kvm_device_attr_get(gic_fd, group, attr, &val); + val |= 1ULL << index; + kvm_device_attr_set(gic_fd, group, attr, &val); +} + +void kvm_irq_write_ispendr(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISPENDR); +} + +void kvm_irq_write_isactiver(int gic_fd, uint32_t intid, struct kvm_vcpu *vcpu) +{ + vgic_poke_irq(gic_fd, intid, vcpu, GICD_ISACTIVER); +} diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index d1cf9f6e0e6b..2bd25b191d15 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -22,7 +22,7 @@ static void test_dump_stack(void) * Build and run this command: * * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \ - * grep -v test_dump_stack | cat -n 1>&2 + * cat -n 1>&2 * * Note that the spacing is different and there's no newline. */ @@ -36,18 +36,24 @@ static void test_dump_stack(void) n * (((sizeof(void *)) * 2) + 1) + /* Null terminator: */ 1]; - char *c; + char *c = cmd; n = backtrace(stack, n); - c = &cmd[0]; - c += sprintf(c, "%s", addr2line); /* - * Skip the first 3 frames: backtrace, test_dump_stack, and - * test_assert. We hope that backtrace isn't inlined and the other two - * we've declared noinline. + * Skip the first 2 frames, which should be test_dump_stack() and + * test_assert(); both of which are declared noinline. Bail if the + * resulting stack trace would be empty. Otherwise, addr2line will block + * waiting for addresses to be passed in via stdin. 
*/ + if (n <= 2) { + fputs(" (stack trace empty)\n", stderr); + return; + } + + c += sprintf(c, "%s", addr2line); for (i = 2; i < n; i++) c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1); + c += sprintf(c, "%s", pipeline); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-result" @@ -71,9 +77,9 @@ test_assert(bool exp, const char *exp_str, fprintf(stderr, "==== Test Assertion Failure ====\n" " %s:%u: %s\n" - " pid=%d tid=%d - %s\n", + " pid=%d tid=%d errno=%d - %s\n", file, line, exp_str, getpid(), _gettid(), - strerror(errno)); + errno, strerror(errno)); test_dump_stack(); if (fmt) { fputs(" ", stderr); @@ -82,8 +88,10 @@ test_assert(bool exp, const char *exp_str, } va_end(ap); - if (errno == EACCES) - ksft_exit_skip("Access denied - Exiting.\n"); + if (errno == EACCES) { + print_skip("Access denied - Exiting"); + exit(KSFT_SKIP); + } exit(254); } diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index bc75a91e00a6..9f54c098d9d0 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -11,7 +11,6 @@ #include <linux/elf.h> #include "kvm_util.h" -#include "kvm_util_internal.h" static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) { @@ -111,8 +110,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp) * by the image and it needs to have sufficient available physical pages, to * back the virtual pages used to load the image. 
*/ -void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, - uint32_t data_memslot, uint32_t pgd_memslot) +void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) { off_t offset, offset_rv; Elf64_Ehdr hdr; @@ -158,14 +156,12 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename, "memsize of 0,\n" " phdr index: %u p_memsz: 0x%" PRIx64, n1, (uint64_t) phdr.p_memsz); - vm_vaddr_t seg_vstart = phdr.p_vaddr; - seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1); + vm_vaddr_t seg_vstart = align_down(phdr.p_vaddr, vm->page_size); vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1; seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; - vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart, - data_memslot, pgd_memslot); + vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart); TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate " "virtual memory for segment at requested min addr,\n" " segment idx: %u\n" diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c new file mode 100644 index 000000000000..99a575bbbc52 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#include "guest_modes.h" + +#ifdef __aarch64__ +#include "processor.h" +enum vm_guest_mode vm_mode_default; +#endif + +struct guest_mode guest_modes[NUM_VM_MODES]; + +void guest_modes_append_default(void) +{ +#ifndef __aarch64__ + guest_mode_append(VM_MODE_DEFAULT, true, true); +#else + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + bool ps4k, ps16k, ps64k; + int i; + + aarch64_get_supported_page_sizes(limit, &ps4k, &ps16k, &ps64k); + + vm_mode_default = NUM_VM_MODES; + + if (limit >= 52) + guest_mode_append(VM_MODE_P52V48_64K, ps64k, ps64k); + if (limit >= 48) { + guest_mode_append(VM_MODE_P48V48_4K, ps4k, ps4k); + guest_mode_append(VM_MODE_P48V48_16K, ps16k, ps16k); + guest_mode_append(VM_MODE_P48V48_64K, ps64k, ps64k); + } + if (limit >= 40) { + guest_mode_append(VM_MODE_P40V48_4K, ps4k, ps4k); + guest_mode_append(VM_MODE_P40V48_16K, ps16k, ps16k); + guest_mode_append(VM_MODE_P40V48_64K, ps64k, ps64k); + if (ps4k) + vm_mode_default = VM_MODE_P40V48_4K; + } + if (limit >= 36) { + guest_mode_append(VM_MODE_P36V48_4K, ps4k, ps4k); + guest_mode_append(VM_MODE_P36V48_16K, ps16k, ps16k); + guest_mode_append(VM_MODE_P36V48_64K, ps64k, ps64k); + guest_mode_append(VM_MODE_P36V47_16K, ps16k, ps16k); + } + + /* + * Pick the first supported IPA size if the default + * isn't available. 
+ */ + for (i = 0; vm_mode_default == NUM_VM_MODES && i < NUM_VM_MODES; i++) { + if (guest_modes[i].supported && guest_modes[i].enabled) + vm_mode_default = i; + } + + TEST_ASSERT(vm_mode_default != NUM_VM_MODES, + "No supported mode!"); + } +#endif +#ifdef __s390x__ + { + int kvm_fd, vm_fd; + struct kvm_s390_vm_cpu_processor info; + + kvm_fd = open_kvm_dev_path_or_exit(); + vm_fd = __kvm_ioctl(kvm_fd, KVM_CREATE_VM, NULL); + kvm_device_attr_get(vm_fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + close(vm_fd); + close(kvm_fd); + /* Starting with z13 we have 47bits of physical address */ + if (info.ibc >= 0x30) + guest_mode_append(VM_MODE_P47V64_4K, true, true); + } +#endif +#ifdef __riscv + { + unsigned int sz = kvm_check_cap(KVM_CAP_VM_GPA_BITS); + + if (sz >= 52) + guest_mode_append(VM_MODE_P52V48_4K, true, true); + if (sz >= 48) + guest_mode_append(VM_MODE_P48V48_4K, true, true); + } +#endif +} + +void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg) +{ + int i; + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + func(i, arg); + } +} + +void guest_modes_help(void) +{ + int i; + + printf(" -m: specify the guest mode ID to test\n" + " (default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? 
" (supported)" : ""); + } +} + +void guest_modes_cmdline(const char *arg) +{ + static bool mode_selected; + unsigned int mode; + int i; + + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; +} diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c index eaf351cc7e7f..fedb2a741f0b 100644 --- a/tools/testing/selftests/kvm/lib/io.c +++ b/tools/testing/selftests/kvm/lib/io.c @@ -61,9 +61,9 @@ ssize_t test_write(int fd, const void *buf, size_t count) continue; case 0: - TEST_ASSERT(false, "Unexpected EOF,\n" - " rc: %zi num_written: %zi num_left: %zu", - rc, num_written, num_left); + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_written: %zi num_left: %zu", + rc, num_written, num_left); break; default: @@ -138,9 +138,9 @@ ssize_t test_read(int fd, void *buf, size_t count) break; case 0: - TEST_ASSERT(false, "Unexpected EOF,\n" - " rc: %zi num_read: %zi num_left: %zu", - rc, num_read, num_left); + TEST_FAIL("Unexpected EOF,\n" + " rc: %zi num_read: %zi num_left: %zu", + rc, num_read, num_left); break; default: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index a6dd0401eb50..f1cb1627161f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -5,27 +5,88 @@ * Copyright (C) 2018, Google LLC. */ +#define _GNU_SOURCE /* for program_invocation_name */ #include "test_util.h" #include "kvm_util.h" -#include "kvm_util_internal.h" #include "processor.h" #include <assert.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> +#include <unistd.h> #include <linux/kernel.h> -#define KVM_UTIL_PGS_PER_HUGEPG 512 #define KVM_UTIL_MIN_PFN 2 -/* Aligns x up to the next multiple of size. Size must be a power of 2. 
*/ -static void *align(void *x, size_t size) +static int vcpu_mmap_sz(void); + +int open_path_or_exit(const char *path, int flags) { - size_t mask = size - 1; - TEST_ASSERT(size != 0 && !(size & (size - 1)), - "size not a power of 2: %lu", size); - return (void *) (((size_t) x + mask) & ~mask); + int fd; + + fd = open(path, flags); + __TEST_REQUIRE(fd >= 0, "%s not available (errno: %d)", path, errno); + + return fd; +} + +/* + * Open KVM_DEV_PATH if available, otherwise exit the entire program. + * + * Input Args: + * flags - The flags to pass when opening KVM_DEV_PATH. + * + * Return: + * The opened file descriptor of /dev/kvm. + */ +static int _open_kvm_dev_path_or_exit(int flags) +{ + return open_path_or_exit(KVM_DEV_PATH, flags); +} + +int open_kvm_dev_path_or_exit(void) +{ + return _open_kvm_dev_path_or_exit(O_RDONLY); +} + +static bool get_module_param_bool(const char *module_name, const char *param) +{ + const int path_size = 128; + char path[path_size]; + char value; + ssize_t r; + int fd; + + r = snprintf(path, path_size, "/sys/module/%s/parameters/%s", + module_name, param); + TEST_ASSERT(r < path_size, + "Failed to construct sysfs path in %d bytes.", path_size); + + fd = open_path_or_exit(path, O_RDONLY); + + r = read(fd, &value, 1); + TEST_ASSERT(r == 1, "read(%s) failed", path); + + r = close(fd); + TEST_ASSERT(!r, "close(%s) failed", path); + + if (value == 'Y') + return true; + else if (value == 'N') + return false; + + TEST_FAIL("Unrecognized value '%c' for boolean module param", value); +} + +bool get_kvm_intel_param_bool(const char *param) +{ + return get_module_param_bool("kvm_intel", param); +} + +bool get_kvm_amd_param_bool(const char *param) +{ + return get_module_param_bool("kvm_amd", param); } /* @@ -44,167 +105,166 @@ static void *align(void *x, size_t size) * Looks up and returns the value corresponding to the capability * (KVM_CAP_*) given by cap. 
*/ -int kvm_check_cap(long cap) +unsigned int kvm_check_cap(long cap) { int ret; int kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); - - ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); - TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); + kvm_fd = open_kvm_dev_path_or_exit(); + ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret)); close(kvm_fd); - return ret; + return (unsigned int)ret; } -/* VM Enable Capability - * - * Input Args: - * vm - Virtual Machine - * cap - Capability - * - * Output Args: None - * - * Return: On success, 0. On failure a TEST_ASSERT failure is produced. - * - * Enables a capability (KVM_CAP_*) on the VM. - */ -int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) { - int ret; - - ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); - TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" - " rc: %i errno: %i", ret, errno); - - return ret; + if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL)) + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size); + else + vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size); + vm->dirty_ring_size = ring_size; } -static void vm_open(struct kvm_vm *vm, int perm) +static void vm_open(struct kvm_vm *vm) { - vm->kvm_fd = open(KVM_DEV_PATH, perm); - if (vm->kvm_fd < 0) - exit(KSFT_SKIP); + vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR); - if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { - fprintf(stderr, "immediate_exit not available, skipping test\n"); - exit(KSFT_SKIP); - } + TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT)); - vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type); - TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " - "rc: %i errno: %i", vm->fd, errno); + vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); + TEST_ASSERT(vm->fd >= 0, 
KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); } -const char * const vm_guest_mode_string[] = { - "PA-bits:52, VA-bits:48, 4K pages", - "PA-bits:52, VA-bits:48, 64K pages", - "PA-bits:48, VA-bits:48, 4K pages", - "PA-bits:48, VA-bits:48, 64K pages", - "PA-bits:40, VA-bits:48, 4K pages", - "PA-bits:40, VA-bits:48, 64K pages", - "PA-bits:ANY, VA-bits:48, 4K pages", +const char *vm_guest_mode_string(uint32_t i) +{ + static const char * const strings[] = { + [VM_MODE_P52V48_4K] = "PA-bits:52, VA-bits:48, 4K pages", + [VM_MODE_P52V48_64K] = "PA-bits:52, VA-bits:48, 64K pages", + [VM_MODE_P48V48_4K] = "PA-bits:48, VA-bits:48, 4K pages", + [VM_MODE_P48V48_16K] = "PA-bits:48, VA-bits:48, 16K pages", + [VM_MODE_P48V48_64K] = "PA-bits:48, VA-bits:48, 64K pages", + [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", + [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", + [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", + [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", + [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", + [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", + [VM_MODE_P36V48_16K] = "PA-bits:36, VA-bits:48, 16K pages", + [VM_MODE_P36V48_64K] = "PA-bits:36, VA-bits:48, 64K pages", + [VM_MODE_P36V47_16K] = "PA-bits:36, VA-bits:47, 16K pages", + }; + _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES, + "Missing new mode strings?"); + + TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i); + + return strings[i]; +} + +const struct vm_guest_mode_params vm_guest_mode_params[] = { + [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, + [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, + [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, + [VM_MODE_P48V48_16K] = { 48, 48, 0x4000, 14 }, + [VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 }, + [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, + [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, + [VM_MODE_P40V48_64K] = { 40, 
48, 0x10000, 16 }, + [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, + [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, + [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, + [VM_MODE_P36V48_16K] = { 36, 48, 0x4000, 14 }, + [VM_MODE_P36V48_64K] = { 36, 48, 0x10000, 16 }, + [VM_MODE_P36V47_16K] = { 36, 47, 0x4000, 14 }, }; -_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, - "Missing new mode strings?"); +_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, + "Missing new mode params?"); -/* - * VM Create - * - * Input Args: - * mode - VM Mode (e.g. VM_MODE_P52V48_4K) - * phy_pages - Physical memory pages - * perm - permission - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - * - * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). - * When phy_pages is non-zero, a memory region of phy_pages physical pages - * is created and mapped starting at guest physical address 0. The file - * descriptor to control the created VM is created with the permissions - * given by perm (e.g. O_RDWR). 
- */ -struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +struct kvm_vm *____vm_create(enum vm_guest_mode mode, uint64_t nr_pages) { struct kvm_vm *vm; - DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + pr_debug("%s: mode='%s' pages='%ld'\n", __func__, + vm_guest_mode_string(mode), nr_pages); vm = calloc(1, sizeof(*vm)); TEST_ASSERT(vm != NULL, "Insufficient Memory"); + INIT_LIST_HEAD(&vm->vcpus); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); + vm->mode = mode; vm->type = 0; + vm->pa_bits = vm_guest_mode_params[mode].pa_bits; + vm->va_bits = vm_guest_mode_params[mode].va_bits; + vm->page_size = vm_guest_mode_params[mode].page_size; + vm->page_shift = vm_guest_mode_params[mode].page_shift; + /* Setup mode specific traits. */ switch (vm->mode) { case VM_MODE_P52V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 52; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P52V48_64K: vm->pgtable_levels = 3; - vm->pa_bits = 52; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_P48V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 48; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P48V48_64K: vm->pgtable_levels = 3; - vm->pa_bits = 48; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_P40V48_4K: + case VM_MODE_P36V48_4K: vm->pgtable_levels = 4; - vm->pa_bits = 40; - vm->va_bits = 48; - vm->page_size = 0x1000; - vm->page_shift = 12; break; case VM_MODE_P40V48_64K: + case VM_MODE_P36V48_64K: + vm->pgtable_levels = 3; + break; + case VM_MODE_P48V48_16K: + case VM_MODE_P40V48_16K: + case VM_MODE_P36V48_16K: + vm->pgtable_levels = 4; + break; + case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; - vm->pa_bits = 40; - vm->va_bits = 48; - vm->page_size = 0x10000; - vm->page_shift = 16; break; case VM_MODE_PXXV48_4K: #ifdef 
__x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); - TEST_ASSERT(vm->va_bits == 48, "Linear address width " - "(%d bits) not supported", vm->va_bits); + /* + * Ignore KVM support for 5-level paging (vm->va_bits == 57), + * it doesn't take effect unless a CR4.LA57 is set, which it + * isn't for this VM_MODE. + */ + TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, + "Linear address width (%d bits) not supported", + vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", + vm->pa_bits); vm->pgtable_levels = 4; - vm->page_size = 0x1000; - vm->page_shift = 12; - DEBUG("Guest physical address width detected: %d\n", - vm->pa_bits); + vm->va_bits = 48; #else - TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on " - "non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); #endif break; + case VM_MODE_P47V64_4K: + vm->pgtable_levels = 5; + break; + case VM_MODE_P44V64_4K: + vm->pgtable_levels = 5; + break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); } #ifdef __aarch64__ @@ -212,7 +272,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); #endif - vm_open(vm, perm); + vm_open(vm); /* Limit to VA-bit canonical virtual addresses. */ vm->vpages_valid = sparsebit_alloc(); @@ -223,20 +283,117 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) (1ULL << (vm->va_bits - 1)) >> vm->page_shift); /* Limit physical addresses to PA-bits. */ - vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; + vm->max_gfn = vm_compute_max_gfn(vm); /* Allocate and setup memory for guest. 
*/ vm->vpages_mapped = sparsebit_alloc(); - if (phy_pages != 0) + if (nr_pages != 0) vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - 0, 0, phy_pages, 0); + 0, 0, nr_pages, 0); return vm; } -struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) +static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, + uint32_t nr_runnable_vcpus, + uint64_t extra_mem_pages) { - return _vm_create(mode, phy_pages, perm); + uint64_t nr_pages; + + TEST_ASSERT(nr_runnable_vcpus, + "Use vm_create_barebones() for VMs that _never_ have vCPUs\n"); + + TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS), + "nr_vcpus = %d too large for host, max-vcpus = %d", + nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS)); + + /* + * Arbitrarily allocate 512 pages (2mb when page size is 4kb) for the + * test code and other per-VM assets that will be loaded into memslot0. + */ + nr_pages = 512; + + /* Account for the per-vCPU stacks on behalf of the test. */ + nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS; + + /* + * Account for the number of pages needed for the page tables. The + * maximum page table size for a memory region will be when the + * smallest page size is used. Considering each page contains x page + * table descriptors, the total extra size for page tables (for extra + * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller + * than N/x*2. 
+ */ + nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2; + + return vm_adjust_num_guest_pages(mode, nr_pages); +} + +struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, + uint64_t nr_extra_pages) +{ + uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, + nr_extra_pages); + struct kvm_vm *vm; + + vm = ____vm_create(mode, nr_pages); + + kvm_vm_elf_load(vm, program_invocation_name); + +#ifdef __x86_64__ + vm_create_irqchip(vm); +#endif + return vm; +} + +/* + * VM Create with customized parameters + * + * Input Args: + * mode - VM Mode (e.g. VM_MODE_P52V48_4K) + * nr_vcpus - VCPU count + * extra_mem_pages - Non-slot0 physical memory total size + * guest_code - Guest entry point + * vcpuids - VCPU IDs + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + * + * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K). + * extra_mem_pages is only used to calculate the maximum page table size, + * no real memory allocation for non-slot0 memory in this function. 
+ */ +struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, + uint64_t extra_mem_pages, + void *guest_code, struct kvm_vcpu *vcpus[]) +{ + struct kvm_vm *vm; + int i; + + TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); + + vm = __vm_create(mode, nr_vcpus, extra_mem_pages); + + for (i = 0; i < nr_vcpus; ++i) + vcpus[i] = vm_vcpu_add(vm, i, guest_code); + + return vm; +} + +struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + struct kvm_vcpu *vcpus[1]; + struct kvm_vm *vm; + + vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, + guest_code, vcpus); + + *vcpu = vcpus[0]; + return vm; } /* @@ -244,7 +401,6 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) * * Input Args: * vm - VM that has been released before - * perm - permission * * Output Args: None * @@ -252,21 +408,21 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) * global state, such as the irqchip and the memory regions that are mapped * into the guest. 
*/ -void kvm_vm_restart(struct kvm_vm *vmp, int perm) +void kvm_vm_restart(struct kvm_vm *vmp) { + int ctr; struct userspace_mem_region *region; - vm_open(vmp, perm); + vm_open(vmp); if (vmp->has_irqchip) vm_create_irqchip(vmp); - for (region = vmp->userspace_mem_region_head; region; - region = region->next) { + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx", + " guest_phys_addr: 0x%llx size: 0x%llx", ret, errno, region->region.slot, region->region.flags, region->region.guest_phys_addr, @@ -274,27 +430,17 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) } } -void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) +__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, + uint32_t vcpu_id) { - struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot }; - int ret; - - ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s", - strerror(-ret)); + return __vm_vcpu_add(vm, vcpu_id); } -void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, - uint64_t first_page, uint32_t num_pages) +struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm) { - struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, - .first_page = first_page, - .num_pages = num_pages }; - int ret; + kvm_vm_restart(vm); - ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); - TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s", - strerror(-ret)); + return vm_vcpu_recreate(vm, 0); } /* @@ -319,15 +465,21 @@ void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - for
(region = vm->userspace_mem_region_head; region; - region = region->next) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; @@ -362,82 +514,80 @@ kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, return &region->region; } -/* - * VCPU Find - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to VCPU structure - * - * Locates a vcpu structure that describes the VCPU specified by vcpuid and - * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU - * for the specified vcpuid. - */ -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid) +__weak void vcpu_arch_free(struct kvm_vcpu *vcpu) { - struct vcpu *vcpup; - - for (vcpup = vm->vcpu_head; vcpup; vcpup = vcpup->next) { - if (vcpup->id == vcpuid) - return vcpup; - } - return NULL; } /* * VM VCPU Remove * * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID + * vcpu - VCPU to remove * * Output Args: None * * Return: None, TEST_ASSERT failures for all error conditions * - * Within the VM specified by vm, removes the VCPU given by vcpuid. + * Removes a vCPU from a VM and frees its resources.
*/ -static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid) +static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; - ret = munmap(vcpu->state, sizeof(*vcpu->state)); - TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i " - "errno: %i", ret, errno); - close(vcpu->fd); - TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i " - "errno: %i", ret, errno); - - if (vcpu->next) - vcpu->next->prev = vcpu->prev; - if (vcpu->prev) - vcpu->prev->next = vcpu->next; - else - vm->vcpu_head = vcpu->next; + if (vcpu->dirty_gfns) { + ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + vcpu->dirty_gfns = NULL; + } + + ret = munmap(vcpu->run, vcpu_mmap_sz()); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + + ret = close(vcpu->fd); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + + list_del(&vcpu->list); + + vcpu_arch_free(vcpu); free(vcpu); } void kvm_vm_release(struct kvm_vm *vmp) { + struct kvm_vcpu *vcpu, *tmp; int ret; - while (vmp->vcpu_head) - vm_vcpu_rm(vmp, vmp->vcpu_head->id); + list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) + vm_vcpu_rm(vmp, vcpu); ret = close(vmp->fd); - TEST_ASSERT(ret == 0, "Close of vm fd failed,\n" - " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + + ret = close(vmp->kvm_fd); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); +} + +static void __vm_mem_region_delete(struct kvm_vm *vm, + struct userspace_mem_region *region, + bool unlink) +{ + int ret; + + if (unlink) { + rb_erase(&region->gpa_node, &vm->regions.gpa_tree); + rb_erase(&region->hva_node, &vm->regions.hva_tree); + hash_del(&region->slot_node); + } - close(vmp->kvm_fd); - TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n" - " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno); + region->region.memory_size = 0; + vm_ioctl(vm,
KVM_SET_USER_MEMORY_REGION, &region->region); + + sparsebit_free(&region->unused_phy_pages); + ret = munmap(region->mmap_start, region->mmap_size); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); + + free(region); } /* @@ -445,31 +595,23 @@ void kvm_vm_release(struct kvm_vm *vmp) */ void kvm_vm_free(struct kvm_vm *vmp) { - int ret; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; - /* Free userspace_mem_regions. */ - while (vmp->userspace_mem_region_head) { - struct userspace_mem_region *region - = vmp->userspace_mem_region_head; - - region->region.memory_size = 0; - ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, - &region->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, " - "rc: %i errno: %i", ret, errno); - - vmp->userspace_mem_region_head = region->next; - sparsebit_free(&region->unused_phy_pages); - ret = munmap(region->mmap_start, region->mmap_size); - TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", - ret, errno); - - free(region); + /* Free cached stats metadata and close FD */ + if (vmp->stats_fd) { + free(vmp->stats_desc); + close(vmp->stats_fd); } + /* Free userspace_mem_regions. */ + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); + /* Free sparsebit arrays.
*/ sparsebit_free(&vmp->vpages_valid); sparsebit_free(&vmp->vpages_mapped); @@ -480,6 +622,26 @@ void kvm_vm_free(struct kvm_vm *vmp) free(vmp); } +int kvm_memfd_alloc(size_t size, bool hugepages) +{ + int memfd_flags = MFD_CLOEXEC; + int fd, r; + + if (hugepages) + memfd_flags |= MFD_HUGETLB; + + fd = memfd_create("kvm_selftest", memfd_flags); + TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); + + r = ftruncate(fd, size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); + + r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + return fd; +} + /* * Memory Compare, host virtual to guest virtual * @@ -550,13 +712,88 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(&region->gpa_node, parent, cur); + rb_insert_color(&region->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in
region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(&region->hva_node, parent, cur); + rb_insert_color(&region->hva_node, hva_tree); +} + + +int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva) +{ + struct kvm_userspace_memory_region region = { + .slot = slot, + .flags = flags, + .guest_phys_addr = gpa, + .memory_size = size, + .userspace_addr = (uintptr_t)hva, + }; + + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region); +} + +void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva) +{ + int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)", + errno, strerror(errno)); +} + /* * VM Userspace Memory Region Add * * Input Args: * vm - Virtual Machine - * backing_src - Storage source for this region. - * NULL to use anonymous memory. + * src_type - Storage source for this region. + * NULL to use anonymous memory. * guest_paddr - Starting guest physical address * slot - KVM region slot * npages - Number of physical pages @@ -579,9 +816,13 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, { int ret; struct userspace_mem_region *region; - size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + size_t backing_src_pagesz = get_backing_src_pagesz(src_type); size_t alignment; + TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, + "Number of guest pages is not compatible with the host.
" + "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); + TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" " guest_paddr: 0x%lx vm->page_size: 0x%x", @@ -600,7 +841,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region = (struct userspace_mem_region *) userspace_mem_region_find( vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); if (region != NULL) - TEST_ASSERT(false, "overlapping userspace_mem_region already " + TEST_FAIL("overlapping userspace_mem_region already " "exists\n" " requested guest_paddr: 0x%lx npages: 0x%lx " "page_size: 0x%x\n" @@ -610,13 +851,12 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if (region->region.slot == slot) - break; - } - if (region != NULL) - TEST_ASSERT(false, "A mem region with the requested slot " + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { + if (region->region.slot != slot) + continue; + + TEST_FAIL("A mem region with the requested slot " "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", @@ -624,6 +864,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); + } /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); @@ -637,34 +878,49 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, alignment = 1; #endif + /* + * When using THP mmap is not guaranteed to returned a hugepage aligned + * address so we have to pad the mmap. Padding is not needed for HugeTLB + * because mmap will always return an address aligned to the HugeTLB + * page size. 
+ */ if (src_type == VM_MEM_SRC_ANONYMOUS_THP) - alignment = max(huge_page_size, alignment); + alignment = max(backing_src_pagesz, alignment); + + ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) region->mmap_size += alignment; + region->fd = -1; + if (backing_src_is_shared(src_type)) + region->fd = kvm_memfd_alloc(region->mmap_size, + src_type == VM_MEM_SRC_SHARED_HUGETLB); + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS - | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0), - -1, 0); + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, - "test_malloc failed, mmap_start: %p errno: %i", - region->mmap_start, errno); + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + + TEST_ASSERT(!is_backing_src_hugetlb(src_type) || + region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), + "mmap_start %p is not aligned to HugeTLB page size 0x%lx", + region->mmap_start, backing_src_pagesz); /* Align host address */ - region->host_mem = align(region->mmap_start, alignment); + region->host_mem = align_ptr_up(region->mmap_start, alignment); /* As needed perform madvise */ - if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { + if ((src_type == VM_MEM_SRC_ANONYMOUS || + src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { ret = madvise(region->host_mem, npages * vm->page_size, - src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); - TEST_ASSERT(ret == 0, "madvise failed,\n" - " addr: %p\n" - " length: 0x%lx\n" - " src_type: %x", - region->host_mem, npages * vm->page_size, src_type); + src_type == VM_MEM_SRC_ANONYMOUS ? 
MADV_NOHUGEPAGE : MADV_HUGEPAGE); + TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", + region->host_mem, npages * vm->page_size, + vm_mem_backing_src_alias(src_type)->name); } region->unused_phy_pages = sparsebit_alloc(); @@ -675,7 +931,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.guest_phys_addr = guest_paddr; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" @@ -683,11 +939,23 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size); - /* Add to linked-list of memory regions. */ - if (vm->userspace_mem_region_head) - vm->userspace_mem_region_head->prev = region; - region->next = vm->userspace_mem_region_head; - vm->userspace_mem_region_head = region; + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, &region->slot_node, slot); + + /* If shared memory, create an alias.
*/ + if (region->fd >= 0) { + region->mmap_alias = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); + TEST_ASSERT(region->mmap_alias != MAP_FAILED, + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + + /* Align host alias address */ + region->host_alias = align_ptr_up(region->mmap_alias, alignment); + } } /* @@ -710,20 +978,17 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) - break; - } - if (region == NULL) { - fprintf(stderr, "No mem region with the requested slot found,\n" - " requested slot: %u\n", memslot); - fputs("---- vm dump ----\n", stderr); - vm_dump(stderr, vm, 2); - TEST_ASSERT(false, "Mem region not found"); - } + return region; - return region; + fprintf(stderr, "No mem region with the requested slot found,\n" + " requested slot: %u\n", memslot); + fputs("---- vm dump ----\n", stderr); + vm_dump(stderr, vm, 2); + TEST_FAIL("Mem region not found"); + return NULL; } /* @@ -749,7 +1014,7 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) region->region.flags = flags; - ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i slot: %u flags: 0x%x", @@ -757,84 +1022,113 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) } /* - * VCPU mmap Size + * VM Memory Region Move * - * Input Args: None + * Input Args: + * vm - Virtual Machine + * slot - Slot of the memory region to move + * new_gpa - Starting guest physical address * * Output Args: None * - * Return: - * Size of VCPU state + * Return: None * - * Returns the size
of the structure pointed to by the return value - * of vcpu_state(). + * Change the gpa of a memory region. */ -static int vcpu_mmap_sz(void) +void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) { - int dev_fd, ret; + struct userspace_mem_region *region; + int ret; - dev_fd = open(KVM_DEV_PATH, O_RDONLY); - if (dev_fd < 0) - exit(KSFT_SKIP); + region = memslot2region(vm, slot); - ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); - TEST_ASSERT(ret >= sizeof(struct kvm_run), - "%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i", - __func__, ret, errno); + region->region.guest_phys_addr = new_gpa; - close(dev_fd); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region->region); - return ret; + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + "ret: %i errno: %i slot: %u new_gpa: 0x%lx", + ret, errno, slot, new_gpa); } /* - * VM VCPU Add + * VM Memory Region Delete * * Input Args: * vm - Virtual Machine - * vcpuid - VCPU ID + * slot - Slot of the memory region to delete * * Output Args: None * * Return: None * - * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid. - * No additional VCPU setup is done. + * Delete a memory region. + */ +void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) +{ + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); +} + +/* Returns the size of a vCPU's kvm_run structure. */ +static int vcpu_mmap_sz(void) +{ + int dev_fd, ret; + + dev_fd = open_kvm_dev_path_or_exit(); + + ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); + TEST_ASSERT(ret >= sizeof(struct kvm_run), + KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret)); + + close(dev_fd); + + return ret; +} + +static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu; + + list_for_each_entry(vcpu, &vm->vcpus, list) { + if (vcpu->id == vcpu_id) + return true; + } + + return false; +} + +/* + * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
+ * No additional vCPU setup is done. Returns the vCPU. */ -void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid) +struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) { - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; /* Confirm a vcpu with the specified id doesn't already exist. */ - vcpu = vcpu_find(vm, vcpuid); - if (vcpu != NULL) - TEST_ASSERT(false, "vcpu with the specified id " - "already exists,\n" - " requested vcpuid: %u\n" - " existing vcpuid: %u state: %p", - vcpuid, vcpu->id, vcpu->state); + TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists\n", vcpu_id); /* Allocate and initialize new vcpu structure. */ vcpu = calloc(1, sizeof(*vcpu)); TEST_ASSERT(vcpu != NULL, "Insufficient Memory"); - vcpu->id = vcpuid; - vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid); - TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i", - vcpu->fd, errno); - TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->state), "vcpu mmap size " + vcpu->vm = vm; + vcpu->id = vcpu_id; + vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id); + TEST_ASSERT(vcpu->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VCPU, vcpu->fd)); + + TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size " "smaller than expected, vcpu_mmap_sz: %i expected_min: %zi", - vcpu_mmap_sz(), sizeof(*vcpu->state)); - vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state), + vcpu_mmap_sz(), sizeof(*vcpu->run)); + vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(), PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0); - TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, " - "vcpu id: %u errno: %i", vcpuid, errno); + TEST_ASSERT(vcpu->run != MAP_FAILED, + __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); /* Add to linked-list of VCPUs. 
*/ - if (vm->vcpu_head) - vm->vcpu_head->prev = vcpu; - vcpu->next = vm->vcpu_head; - vm->vcpu_head = vcpu; + list_add(&vcpu->list, &vm->vcpus); + + return vcpu; } /* @@ -901,8 +1195,7 @@ static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz, } while (pgidx_start != 0); no_va_found: - TEST_ASSERT(false, "No vaddr of specified pages available, " - "pages: 0x%lx", pages); + TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages); /* NOT REACHED */ return -1; @@ -931,8 +1224,6 @@ va_found: * vm - Virtual Machine * sz - Size in bytes * vaddr_min - Minimum starting virtual address - * data_memslot - Memory region slot for data pages - * pgd_memslot - Memory region slot for new virtual translation tables * * Output Args: None * @@ -945,12 +1236,13 @@ va_found: * a unique set of pages, with the minimum real allocation being at least * a page. */ -vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, - uint32_t data_memslot, uint32_t pgd_memslot) +vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min) { uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); - virt_pgd_alloc(vm, pgd_memslot); + virt_pgd_alloc(vm); + vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, + KVM_UTIL_MIN_PFN * vm->page_size, 0); /* * Find an unused range of virtual page addresses of at least @@ -960,13 +1252,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, /* Map the virtual pages. 
*/ for (vm_vaddr_t vaddr = vaddr_start; pages > 0; - pages--, vaddr += vm->page_size) { - vm_paddr_t paddr; + pages--, vaddr += vm->page_size, paddr += vm->page_size) { - paddr = vm_phy_page_alloc(vm, - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); - - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); @@ -976,33 +1264,70 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, } /* + * VM Virtual Address Allocate Pages + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least N system pages worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages) +{ + return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR); +} + +/* + * VM Virtual Address Allocate Page + * + * Input Args: + * vm - Virtual Machine + * + * Output Args: None + * + * Return: + * Starting guest virtual address + * + * Allocates at least one system page worth of bytes within the virtual address + * space of the vm. + */ +vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm) +{ + return vm_vaddr_alloc_pages(vm, 1); +} + +/* * Map a range of VM virtual address to the VM's physical address * * Input Args: * vm - Virtual Machine * vaddr - Virtuall address to map * paddr - VM Physical Address - * size - The size of the range to map - * pgd_memslot - Memory region slot for new virtual translation tables + * npages - The number of pages to map * * Output Args: None * * Return: None * - * Within the VM given by vm, creates a virtual translation for the - * page range starting at vaddr to the page range starting at paddr. + * Within the VM given by @vm, creates a virtual translation for + * @npages starting at @vaddr to the page range starting at @paddr. 
*/ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - size_t size, uint32_t pgd_memslot) + unsigned int npages) { size_t page_size = vm->page_size; - size_t npages = size / page_size; + size_t size = npages * page_size; TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow"); TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - virt_pg_map(vm, vaddr, paddr, pgd_memslot); + virt_pg_map(vm, vaddr, paddr); vaddr += page_size; paddr += page_size; } @@ -1028,17 +1353,15 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_ASSERT(false, "No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1060,410 +1383,288 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; - for (region = vm->userspace_mem_region_head; region; - region = region->next) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + struct rb_node *node; + + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); + + if (hva >= region->host_mem) { + if (hva <= (region->host_mem 
+ + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); + + node = node->rb_right; + } else + node = node->rb_left; } - TEST_ASSERT(false, "No mapping to a guest physical address, " - "hva: %p", hva); + TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); return -1; } /* - * VM Create IRQ Chip + * Address VM physical to Host Virtual *alias*. * * Input Args: * vm - Virtual Machine + * gpa - VM physical address * * Output Args: None * - * Return: None - * - * Creates an interrupt controller chip for the VM specified by vm. + * Return: + * Equivalent address within the host virtual *alias* area, or NULL + * (without failing the test) if the guest memory is not shared (so + * no alias exists). + * + * Create a writable, shared virtual=>physical alias for the specific GPA. + * The primary use case is to allow the host selftest to manipulate guest + * memory without mapping said memory in the guest's address space. And, for + * userfaultfd-based demand paging, to do so without triggering userfaults. */ -void vm_create_irqchip(struct kvm_vm *vm) +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) { - int ret; - - ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0); - TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, " - "rc: %i errno: %i", ret, errno); + struct userspace_mem_region *region; + uintptr_t offset; - vm->has_irqchip = true; -} + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) + return NULL; -/* - * VM VCPU State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: - * Pointer to structure that describes the state of the VCPU. - * - * Locates and returns a pointer to a structure that describes the - * state of the VCPU with the given vcpuid. 
- */ -struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + if (!region->host_alias) + return NULL; - return vcpu->state; + offset = gpa - region->region.guest_phys_addr; + return (void *) ((uintptr_t) region->host_alias + offset); } -/* - * VM VCPU Run - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: None - * - * Return: None - * - * Switch to executing the code for the VCPU given by vcpuid, within the VM - * given by vm. - */ -void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +/* Create an interrupt controller chip for the specified VM. */ +void vm_create_irqchip(struct kvm_vm *vm) { - int ret = _vcpu_run(vm, vcpuid); - TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " - "rc: %i errno: %i", ret, errno); + vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL); + + vm->has_irqchip = true; } -int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid) +int _vcpu_run(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int rc; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); do { - rc = ioctl(vcpu->fd, KVM_RUN, NULL); + rc = __vcpu_run(vcpu); } while (rc == -1 && errno == EINTR); - return rc; -} -void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - vcpu->state->immediate_exit = 1; - ret = ioctl(vcpu->fd, KVM_RUN, NULL); - vcpu->state->immediate_exit = 0; + assert_on_unhandled_exception(vcpu); - TEST_ASSERT(ret == -1 && errno == EINTR, - "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", - ret, errno); + return rc; } /* - * VM VCPU Set MP State - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * mp_state - mp_state to be set - * - * Output Args: None - * - * Return: None - * - * Sets the MP state of the VCPU given by vcpuid, to the 
state given - * by mp_state. + * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR. + * Assert if the KVM returns an error (other than -EINTR). */ -void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_mp_state *mp_state) +void vcpu_run(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + int ret = _vcpu_run(vcpu); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state); - TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, " - "rc: %i errno: %i", ret, errno); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret)); } -/* - * VM VCPU Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * regs - current state of VCPU regs - * - * Return: None - * - * Obtains the current register state for the VCPU specified by vcpuid - * and stores it at the location given by regs. - */ -void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) +void vcpu_run_complete_io(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + vcpu->run->immediate_exit = 1; + ret = __vcpu_run(vcpu); + vcpu->run->immediate_exit = 0; - ret = ioctl(vcpu->fd, KVM_GET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i", - ret, errno); + TEST_ASSERT(ret == -1 && errno == EINTR, + "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i", + ret, errno); } /* - * VM VCPU Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * regs - Values to set VCPU regs to - * - * Output Args: None - * - * Return: None - * - * Sets the regs of the VCPU specified by vcpuid to the values - * given by regs. + * Get the list of guest registers which are supported for + * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. 
Returns a kvm_reg_list pointer, + * it is the caller's responsibility to free the list. */ -void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) +struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list; int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n); + TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0"); - ret = ioctl(vcpu->fd, KVM_SET_REGS, regs); - TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i", - ret, errno); + reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64)); + reg_list->n = reg_list_n.n; + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list); + return reg_list; } -#ifdef __KVM_HAVE_VCPU_EVENTS -void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) +void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + uint32_t page_size = vcpu->vm->page_size; + uint32_t size = vcpu->vm->dirty_ring_size; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + TEST_ASSERT(size > 0, "Should enable dirty ring first"); - ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); -} + if (!vcpu->dirty_gfns) { + void *addr; -void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_vcpu_events *events) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; + addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private"); - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr == MAP_FAILED, 
"Dirty ring mapped exec"); - ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); - TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", - ret, errno); -} -#endif + addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, + page_size * KVM_DIRTY_LOG_PAGE_OFFSET); + TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed"); -#ifdef __x86_64__ -void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state); - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); -} - -int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid, - struct kvm_nested_state *state, bool ignore_error) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state); - if (!ignore_error) { - TEST_ASSERT(ret == 0, - "KVM_SET_NESTED_STATE failed, ret: %i errno: %i", - ret, errno); + vcpu->dirty_gfns = addr; + vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn); } - return ret; + return vcpu->dirty_gfns; } -#endif /* - * VM VCPU System Regs Get - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * - * Output Args: - * sregs - current state of VCPU system regs - * - * Return: None - * - * Obtains the current system register state for the VCPU specified by - * vcpuid and stores it at the location given by sregs. 
+ * Device Ioctl */ -void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) -{ - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); +int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr) +{ + struct kvm_device_attr attribute = { + .group = group, + .attr = attr, + .flags = 0, + }; - ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs); - TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i", - ret, errno); + return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute); } -/* - * VM VCPU System Regs Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * sregs - Values to set VCPU system regs to - * - * Output Args: None - * - * Return: None - * - * Sets the system regs of the VCPU specified by vcpuid to the values - * given by sregs. - */ -void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type) { - int ret = _vcpu_sregs_set(vm, vcpuid, sregs); - TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, " - "rc: %i errno: %i", ret, errno); + struct kvm_create_device create_dev = { + .type = type, + .flags = KVM_CREATE_DEVICE_TEST, + }; + + return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); } -int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) +int __kvm_create_device(struct kvm_vm *vm, uint64_t type) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); + struct kvm_create_device create_dev = { + .type = type, + .fd = -1, + .flags = 0, + }; + int err; + + err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev); + TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value"); + return err ? 
: create_dev.fd; } -void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val) { - int ret; + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; + + return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr); +} - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); +int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val) +{ + struct kvm_device_attr kvmattr = { + .group = group, + .attr = attr, + .flags = 0, + .addr = (uintptr_t)val, + }; + + return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr); } -void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +/* + * IRQ related functions. + */ + +int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) { - int ret; + struct kvm_irq_level irq_level = { + .irq = irq, + .level = level, + }; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); - TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level); } -void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level) { - int ret; + int ret = _kvm_irq_line(vm, irq, level); - ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", - ret, errno, strerror(errno)); + TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret)); } -void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +struct kvm_irq_routing *kvm_gsi_routing_create(void) { - int ret; + struct kvm_irq_routing *routing; + size_t size; - ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); - TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i 
(%s)", - ret, errno, strerror(errno)); + size = sizeof(struct kvm_irq_routing); + /* Allocate space for the max number of entries: this wastes 196 KBs. */ + size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry); + routing = calloc(1, size); + assert(routing); + + return routing; } -/* - * VCPU Ioctl - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VCPU fd. - */ -void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing, + uint32_t gsi, uint32_t pin) { - int ret; - - ret = _vcpu_ioctl(vm, vcpuid, cmd, arg); - TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); + int i; + + assert(routing); + assert(routing->nr < KVM_MAX_IRQ_ROUTES); + + i = routing->nr; + routing->entries[i].gsi = gsi; + routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP; + routing->entries[i].flags = 0; + routing->entries[i].u.irqchip.irqchip = 0; + routing->entries[i].u.irqchip.pin = pin; + routing->nr++; } -int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, - unsigned long cmd, void *arg) +int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int ret; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - ret = ioctl(vcpu->fd, cmd, arg); + assert(routing); + ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing); + free(routing); return ret; } -/* - * VM Ioctl - * - * Input Args: - * vm - Virtual Machine - * cmd - Ioctl number - * arg - Argument to pass to the ioctl - * - * Return: None - * - * Issues an arbitrary ioctl on a VM fd. 
- */ -void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing) { int ret; - ret = ioctl(vm->fd, cmd, arg); - TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", - cmd, ret, errno, strerror(errno)); + ret = _kvm_gsi_routing_write(vm, routing); + TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret)); } /* @@ -1483,15 +1684,15 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; - struct vcpu *vcpu; + struct kvm_vcpu *vcpu; fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode); fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - for (region = vm->userspace_mem_region_head; region; - region = region->next) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -1510,8 +1711,9 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump(stream, vm, indent + 4); } fprintf(stream, "%*sVCPUs:\n", indent, ""); - for (vcpu = vm->vcpu_head; vcpu; vcpu = vcpu->next) - vcpu_dump(stream, vm, vcpu->id, indent + 2); + + list_for_each_entry(vcpu, &vm->vcpus, list) + vcpu_dump(stream, vcpu, indent + 2); } /* Known KVM exit reasons */ @@ -1539,6 +1741,10 @@ static struct exit_reason { {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"}, {KVM_EXIT_OSI, "OSI"}, {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"}, + {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"}, + {KVM_EXIT_X86_RDMSR, "RDMSR"}, + {KVM_EXIT_X86_WRMSR, "WRMSR"}, + {KVM_EXIT_XEN, "XEN"}, #ifdef KVM_EXIT_MEMORY_NOT_PRESENT {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"}, #endif @@ -1636,6 +1842,14 @@ vm_paddr_t 
vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min, return vm_phy_pages_alloc(vm, 1, paddr_min, memslot); } +/* Arbitrary minimum physical address used for virtual translation tables. */ +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm) +{ + return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); +} + /* * Address Guest Virtual to Host Virtual * @@ -1653,53 +1867,157 @@ void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva) return addr_gpa2hva(vm, addr_gva2gpa(vm, gva)); } +unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm) +{ + return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; +} + +static unsigned int vm_calc_num_pages(unsigned int num_pages, + unsigned int page_shift, + unsigned int new_page_shift, + bool ceil) +{ + unsigned int n = 1 << (new_page_shift - page_shift); + + if (page_shift >= new_page_shift) + return num_pages * (1 << (page_shift - new_page_shift)); + + return num_pages / n + !!(ceil && num_pages % n); +} + +static inline int getpageshift(void) +{ + return __builtin_ffs(getpagesize()) - 1; +} + +unsigned int +vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) +{ + return vm_calc_num_pages(num_guest_pages, + vm_guest_mode_params[mode].page_shift, + getpageshift(), true); +} + +unsigned int +vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages) +{ + return vm_calc_num_pages(num_host_pages, getpageshift(), + vm_guest_mode_params[mode].page_shift, false); +} + +unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size) +{ + unsigned int n; + n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size); + return vm_adjust_num_guest_pages(mode, n); +} + /* - * Is Unrestricted Guest + * Read binary stats descriptors * * Input Args: - * vm - Virtual Machine + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD * * Output 
Args: None * - * Return: True if the unrestricted guest is set to 'Y', otherwise return false. + * Return: + * A pointer to a newly allocated series of stat descriptors. + * Caller is responsible for freeing the returned kvm_stats_desc. * - * Check if the unrestricted guest flag is enabled. + * Read the stats descriptors from the binary stats interface. */ -bool vm_is_unrestricted_guest(struct kvm_vm *vm) -{ - char val = 'N'; - size_t count; - FILE *f; - - if (vm == NULL) { - /* Ensure that the KVM vendor-specific module is loaded. */ - f = fopen(KVM_DEV_PATH, "r"); - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", - errno); - fclose(f); - } +struct kvm_stats_desc *read_stats_descriptors(int stats_fd, + struct kvm_stats_header *header) +{ + struct kvm_stats_desc *stats_desc; + ssize_t desc_size, total_size, ret; - f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); - if (f) { - count = fread(&val, sizeof(char), 1, f); - TEST_ASSERT(count == 1, "Unable to read from param file."); - fclose(f); - } + desc_size = get_stats_descriptor_size(header); + total_size = header->num_desc * desc_size; - return val == 'Y'; -} + stats_desc = calloc(header->num_desc, desc_size); + TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors"); -unsigned int vm_get_page_size(struct kvm_vm *vm) -{ - return vm->page_size; + ret = pread(stats_fd, stats_desc, total_size, header->desc_offset); + TEST_ASSERT(ret == total_size, "Read KVM stats descriptors"); + + return stats_desc; } -unsigned int vm_get_page_shift(struct kvm_vm *vm) +/* + * Read stat data for a particular stat + * + * Input Args: + * stats_fd - the file descriptor for the binary stats file from which to read + * header - the binary stats metadata header corresponding to the given FD + * desc - the binary stat metadata for the particular stat to be read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data 
should be read + * + * Read the data values of a specified stat from the binary stats interface. + */ +void read_stat_data(int stats_fd, struct kvm_stats_header *header, + struct kvm_stats_desc *desc, uint64_t *data, + size_t max_elements) { - return vm->page_shift; + size_t nr_elements = min_t(ssize_t, desc->size, max_elements); + size_t size = nr_elements * sizeof(*data); + ssize_t ret; + + TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name); + TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name); + + ret = pread(stats_fd, data, size, + header->data_offset + desc->offset); + + TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)", + desc->name, errno, strerror(errno)); + TEST_ASSERT(ret == size, + "pread() on stat '%s' read %ld bytes, wanted %lu bytes", + desc->name, size, ret); } -unsigned int vm_get_max_gfn(struct kvm_vm *vm) +/* + * Read the data of the named stat + * + * Input Args: + * vm - the VM for which the stat should be read + * stat_name - the name of the stat to read + * max_elements - the maximum number of 8-byte values to read into data + * + * Output Args: + * data - the buffer into which stat data should be read + * + * Read the data values of a specified stat from the binary stats interface. 
+ */ +void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, + size_t max_elements) { - return vm->max_gfn; + struct kvm_stats_desc *desc; + size_t size_desc; + int i; + + if (!vm->stats_fd) { + vm->stats_fd = vm_get_stats_fd(vm); + read_stats_header(vm->stats_fd, &vm->stats_header); + vm->stats_desc = read_stats_descriptors(vm->stats_fd, + &vm->stats_header); + } + + size_desc = get_stats_descriptor_size(&vm->stats_header); + + for (i = 0; i < vm->stats_header.num_desc; ++i) { + desc = (void *)vm->stats_desc + (i * size_desc); + + if (strcmp(desc->name, stat_name)) + continue; + + read_stat_data(vm->stats_fd, &vm->stats_header, desc, + data, max_elements); + + break; + } } diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h deleted file mode 100644 index ac50c42750cf..000000000000 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/lib/kvm_util_internal.h - * - * Copyright (C) 2018, Google LLC. 
- */ - -#ifndef SELFTEST_KVM_UTIL_INTERNAL_H -#define SELFTEST_KVM_UTIL_INTERNAL_H - -#include "sparsebit.h" - -#define KVM_DEV_PATH "/dev/kvm" - -#ifndef BITS_PER_BYTE -#define BITS_PER_BYTE 8 -#endif - -#ifndef BITS_PER_LONG -#define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) -#endif - -#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) - -struct userspace_mem_region { - struct userspace_mem_region *next, *prev; - struct kvm_userspace_memory_region region; - struct sparsebit *unused_phy_pages; - int fd; - off_t offset; - void *host_mem; - void *mmap_start; - size_t mmap_size; -}; - -struct vcpu { - struct vcpu *next, *prev; - uint32_t id; - int fd; - struct kvm_run *state; -}; - -struct kvm_vm { - int mode; - unsigned long type; - int kvm_fd; - int fd; - unsigned int pgtable_levels; - unsigned int page_size; - unsigned int page_shift; - unsigned int pa_bits; - unsigned int va_bits; - uint64_t max_gfn; - struct vcpu *vcpu_head; - struct userspace_mem_region *userspace_mem_region_head; - struct sparsebit *vpages_valid; - struct sparsebit *vpages_mapped; - bool has_irqchip; - bool pgd_created; - vm_paddr_t pgd; - vm_vaddr_t gdt; - vm_vaddr_t tss; -}; - -struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid); -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent); -void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent); -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent); - -struct userspace_mem_region * -memslot2region(struct kvm_vm *vm, uint32_t memslot); - -#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c new file mode 100644 index 000000000000..9618b37c66f7 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. 
+ */ +#include <inttypes.h> + +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" + +struct perf_test_args perf_test_args; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +struct vcpu_thread { + /* The index of the vCPU. */ + int vcpu_idx; + + /* The pthread backing the vCPU. */ + pthread_t thread; + + /* Set to true once the vCPU thread is up and running. */ + bool running; +}; + +/* The vCPU threads involved in this test. */ +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; + +/* The function run by each vCPU thread, as provided by the test. */ +static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *); + +/* Set to true once all vCPU threads are up and running. */ +static bool all_vcpu_threads_running; + +static struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +void perf_test_guest_code(uint32_t vcpu_idx) +{ + struct perf_test_args *pta = &perf_test_args; + struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_idx]; + uint64_t gva; + uint64_t pages; + int i; + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + /* Make sure vCPU args data structure is not corrupt. 
*/ + GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx); + + while (true) { + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * pta->guest_page_size); + + if (i % pta->wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +void perf_test_setup_vcpus(struct kvm_vm *vm, int nr_vcpus, + struct kvm_vcpu *vcpus[], + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access) +{ + struct perf_test_args *pta = &perf_test_args; + struct perf_test_vcpu_args *vcpu_args; + int i; + + for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &pta->vcpu_args[i]; + + vcpu_args->vcpu = vcpus[i]; + vcpu_args->vcpu_idx = i; + + if (partition_vcpu_memory_access) { + vcpu_args->gva = guest_test_virt_mem + + (i * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + pta->guest_page_size; + vcpu_args->gpa = pta->gpa + (i * vcpu_memory_bytes); + } else { + vcpu_args->gva = guest_test_virt_mem; + vcpu_args->pages = (nr_vcpus * vcpu_memory_bytes) / + pta->guest_page_size; + vcpu_args->gpa = pta->gpa; + } + + vcpu_args_set(vcpus[i], 1, i); + + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + i, vcpu_args->gpa, vcpu_args->gpa + + (vcpu_args->pages * pta->guest_page_size)); + } +} + +struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int nr_vcpus, + uint64_t vcpu_memory_bytes, int slots, + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access) +{ + struct perf_test_args *pta = &perf_test_args; + struct kvm_vm *vm; + uint64_t guest_num_pages, slot0_pages = 0; + uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); + uint64_t region_end_gfn; + int i; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + /* By default vCPUs will write to memory. */ + pta->wr_fract = 1; + + /* + * Snapshot the non-huge page size. This is used by the guest code to + * access/dirty pages at the logging granularity. 
+ */ + pta->guest_page_size = vm_guest_mode_params[mode].page_size; + + guest_num_pages = vm_adjust_num_guest_pages(mode, + (nr_vcpus * vcpu_memory_bytes) / pta->guest_page_size); + + TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, + "Guest memory size is not host page size aligned."); + TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); + + /* + * If using nested, allocate extra pages for the nested page tables and + * in-memory data structures. + */ + if (pta->nested) + slot0_pages += perf_test_nested_pages(nr_vcpus); + + /* + * Pass guest_num_pages to populate the page tables for test memory. + * The memory is also added to memslot 0, but that's a benign side + * effect as KVM allows aliasing HVAs in memslots. + */ + vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, + perf_test_guest_code, vcpus); + + pta->vm = vm; + + /* Put the test region at the top of guest physical memory. */ + region_end_gfn = vm->max_gfn + 1; + +#ifdef __x86_64__ + /* + * When running vCPUs in L2, restrict the test region to 48 bits to + * avoid needing 5-level page tables to identity map L2. + */ + if (pta->nested) + region_end_gfn = min(region_end_gfn, (1UL << 48) / pta->guest_page_size); +#endif + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. 
+ */ + TEST_ASSERT(guest_num_pages < region_end_gfn, + "Requested more guest memory than address space allows.\n" + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " nr_vcpus: %d wss: %" PRIx64 "]\n", + guest_num_pages, region_end_gfn - 1, nr_vcpus, vcpu_memory_bytes); + + pta->gpa = (region_end_gfn - guest_num_pages - 1) * pta->guest_page_size; + pta->gpa = align_down(pta->gpa, backing_src_pagesz); +#ifdef __s390x__ + /* Align to 1M (segment size) */ + pta->gpa = align_down(pta->gpa, 1 << 20); +#endif + pta->size = guest_num_pages * pta->guest_page_size; + pr_info("guest physical test memory: [0x%lx, 0x%lx)\n", + pta->gpa, pta->gpa + pta->size); + + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = pta->gpa + region_pages * pta->guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + PERF_TEST_MEM_SLOT_INDEX + i, + region_pages, 0); + } + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages); + + perf_test_setup_vcpus(vm, nr_vcpus, vcpus, vcpu_memory_bytes, + partition_vcpu_memory_access); + + if (pta->nested) { + pr_info("Configuring vCPUs to run in L2 (nested).\n"); + perf_test_setup_nested(vm, nr_vcpus, vcpus); + } + + ucall_init(vm, NULL); + + /* Export the shared variables to the guest. 
*/ + sync_global_to_guest(vm, perf_test_args); + + return vm; +} + +void perf_test_destroy_vm(struct kvm_vm *vm) +{ + ucall_uninit(vm); + kvm_vm_free(vm); +} + +void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) +{ + perf_test_args.wr_fract = wr_fract; + sync_global_to_guest(vm, perf_test_args); +} + +uint64_t __weak perf_test_nested_pages(int nr_vcpus) +{ + return 0; +} + +void __weak perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu **vcpus) +{ + pr_info("%s() not support on this architecture, skipping.\n", __func__); + exit(KSFT_SKIP); +} + +static void *vcpu_thread_main(void *data) +{ + struct vcpu_thread *vcpu = data; + + WRITE_ONCE(vcpu->running, true); + + /* + * Wait for all vCPU threads to be up and running before calling the test- + * provided vCPU thread function. This prevents thread creation (which + * requires taking the mmap_sem in write mode) from interfering with the + * guest faulting in its memory. + */ + while (!READ_ONCE(all_vcpu_threads_running)) + ; + + vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_idx]); + + return NULL; +} + +void perf_test_start_vcpu_threads(int nr_vcpus, + void (*vcpu_fn)(struct perf_test_vcpu_args *)) +{ + int i; + + vcpu_thread_fn = vcpu_fn; + WRITE_ONCE(all_vcpu_threads_running, false); + + for (i = 0; i < nr_vcpus; i++) { + struct vcpu_thread *vcpu = &vcpu_threads[i]; + + vcpu->vcpu_idx = i; + WRITE_ONCE(vcpu->running, false); + + pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); + } + + for (i = 0; i < nr_vcpus; i++) { + while (!READ_ONCE(vcpu_threads[i].running)) + ; + } + + WRITE_ONCE(all_vcpu_threads_running, true); +} + +void perf_test_join_vcpu_threads(int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) + pthread_join(vcpu_threads[i].thread, NULL); +} diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ 
b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c" diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c new file mode 100644 index 000000000000..604478151212 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V code + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/compiler.h> +#include <assert.h> + +#include "kvm_util.h" +#include "processor.h" + +#define DEFAULT_RISCV_GUEST_STACK_VADDR_MIN 0xac0000 + +static uint64_t page_align(struct kvm_vm *vm, uint64_t v) +{ + return (v + vm->page_size) & ~(vm->page_size - 1); +} + +static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry) +{ + return ((entry & PGTBL_PTE_ADDR_MASK) >> PGTBL_PTE_ADDR_SHIFT) << + PGTBL_PAGE_SIZE_SHIFT; +} + +static uint64_t ptrs_per_pte(struct kvm_vm *vm) +{ + return PGTBL_PAGE_SIZE / sizeof(uint64_t); +} + +static uint64_t pte_index_mask[] = { + PGTBL_L0_INDEX_MASK, + PGTBL_L1_INDEX_MASK, + PGTBL_L2_INDEX_MASK, + PGTBL_L3_INDEX_MASK, +}; + +static uint32_t pte_index_shift[] = { + PGTBL_L0_INDEX_SHIFT, + PGTBL_L1_INDEX_SHIFT, + PGTBL_L2_INDEX_SHIFT, + PGTBL_L3_INDEX_SHIFT, +}; + +static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) +{ + TEST_ASSERT(level > -1, + "Negative page table level (%d) not possible", level); + TEST_ASSERT(level < vm->pgtable_levels, + "Invalid page table level (%d)", level); + + return (gva & pte_index_mask[level]) >> pte_index_shift[level]; +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) +{ + if (!vm->pgd_created) { + vm_paddr_t paddr = vm_phy_pages_alloc(vm, + page_align(vm, ptrs_per_pte(vm) * 8) / vm->page_size, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); + vm->pgd = paddr; + vm->pgd_created = true; + } +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + uint64_t *ptep, next_ppn; + 
int level = vm->pgtable_levels - 1; + + TEST_ASSERT((vaddr % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, vaddr, level) * 8; + if (!*ptep) { + next_ppn = vm_alloc_page_table(vm) >> PGTBL_PAGE_SIZE_SHIFT; + *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_VALID_MASK; + } + level--; + + while (level > -1) { + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + + pte_index(vm, vaddr, level) * 8; + if (!*ptep && level > 0) { + next_ppn = vm_alloc_page_table(vm) >> + PGTBL_PAGE_SIZE_SHIFT; + *ptep = (next_ppn << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_VALID_MASK; + } + level--; + } + + paddr = paddr >> PGTBL_PAGE_SIZE_SHIFT; + *ptep = (paddr << PGTBL_PTE_ADDR_SHIFT) | + PGTBL_PTE_PERM_MASK | PGTBL_PTE_VALID_MASK; +} + +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + uint64_t *ptep; + int level = vm->pgtable_levels - 1; + + if (!vm->pgd_created) + goto unmapped_gva; + + ptep = addr_gpa2hva(vm, vm->pgd) + pte_index(vm, gva, level) * 8; + if (!ptep) + goto unmapped_gva; + level--; + + while (level > -1) { + ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + + pte_index(vm, gva, level) * 8; + if (!ptep) + goto unmapped_gva; + level--; + } + + return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1)); + +unmapped_gva: + TEST_FAIL("No mapping for vm virtual address gva: 0x%lx level: %d", + gva, level); + exit(1); +} + +static void pte_dump(FILE *stream, 
struct kvm_vm *vm, uint8_t indent, + uint64_t page, int level) +{ +#ifdef DEBUG + static const char *const type[] = { "pte", "pmd", "pud", "p4d"}; + uint64_t pte, *ptep; + + if (level < 0) + return; + + for (pte = page; pte < page + ptrs_per_pte(vm) * 8; pte += 8) { + ptep = addr_gpa2hva(vm, pte); + if (!*ptep) + continue; + fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "", + type[level], pte, *ptep, ptep); + pte_dump(stream, vm, indent + 1, + pte_addr(vm, *ptep), level - 1); + } +#endif +} + +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + int level = vm->pgtable_levels - 1; + uint64_t pgd, *ptep; + + if (!vm->pgd_created) + return; + + for (pgd = vm->pgd; pgd < vm->pgd + ptrs_per_pte(vm) * 8; pgd += 8) { + ptep = addr_gpa2hva(vm, pgd); + if (!*ptep) + continue; + fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "", + pgd, *ptep, ptep); + pte_dump(stream, vm, indent + 1, + pte_addr(vm, *ptep), level - 1); + } +} + +void riscv_vcpu_mmu_setup(struct kvm_vcpu *vcpu) +{ + struct kvm_vm *vm = vcpu->vm; + unsigned long satp; + + /* + * The RISC-V Sv48 MMU mode supports 56-bit physical address + * for 48-bit virtual address with 4KB last level page size. 
+ */ + switch (vm->mode) { + case VM_MODE_P52V48_4K: + case VM_MODE_P48V48_4K: + case VM_MODE_P40V48_4K: + break; + default: + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); + } + + satp = (vm->pgd >> PGTBL_PAGE_SIZE_SHIFT) & SATP_PPN; + satp |= SATP_MODE_48; + + vcpu_set_reg(vcpu, RISCV_CSR_REG(satp), satp); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ + struct kvm_riscv_core core; + + vcpu_get_reg(vcpu, RISCV_CORE_REG(mode), &core.mode); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.pc), &core.regs.pc); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.ra), &core.regs.ra); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.sp), &core.regs.sp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.gp), &core.regs.gp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.tp), &core.regs.tp); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t0), &core.regs.t0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t1), &core.regs.t1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t2), &core.regs.t2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s0), &core.regs.s0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s1), &core.regs.s1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a0), &core.regs.a0); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a1), &core.regs.a1); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a2), &core.regs.a2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a3), &core.regs.a3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a4), &core.regs.a4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a5), &core.regs.a5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a6), &core.regs.a6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.a7), &core.regs.a7); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s2), &core.regs.s2); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s3), &core.regs.s3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s4), &core.regs.s4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s5), &core.regs.s5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s6), &core.regs.s6); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s7), &core.regs.s7); + vcpu_get_reg(vcpu, 
RISCV_CORE_REG(regs.s8), &core.regs.s8); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s9), &core.regs.s9); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s10), &core.regs.s10); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.s11), &core.regs.s11); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t3), &core.regs.t3); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t4), &core.regs.t4); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t5), &core.regs.t5); + vcpu_get_reg(vcpu, RISCV_CORE_REG(regs.t6), &core.regs.t6); + + fprintf(stream, + " MODE: 0x%lx\n", core.mode); + fprintf(stream, + " PC: 0x%016lx RA: 0x%016lx SP: 0x%016lx GP: 0x%016lx\n", + core.regs.pc, core.regs.ra, core.regs.sp, core.regs.gp); + fprintf(stream, + " TP: 0x%016lx T0: 0x%016lx T1: 0x%016lx T2: 0x%016lx\n", + core.regs.tp, core.regs.t0, core.regs.t1, core.regs.t2); + fprintf(stream, + " S0: 0x%016lx S1: 0x%016lx A0: 0x%016lx A1: 0x%016lx\n", + core.regs.s0, core.regs.s1, core.regs.a0, core.regs.a1); + fprintf(stream, + " A2: 0x%016lx A3: 0x%016lx A4: 0x%016lx A5: 0x%016lx\n", + core.regs.a2, core.regs.a3, core.regs.a4, core.regs.a5); + fprintf(stream, + " A6: 0x%016lx A7: 0x%016lx S2: 0x%016lx S3: 0x%016lx\n", + core.regs.a6, core.regs.a7, core.regs.s2, core.regs.s3); + fprintf(stream, + " S4: 0x%016lx S5: 0x%016lx S6: 0x%016lx S7: 0x%016lx\n", + core.regs.s4, core.regs.s5, core.regs.s6, core.regs.s7); + fprintf(stream, + " S8: 0x%016lx S9: 0x%016lx S10: 0x%016lx S11: 0x%016lx\n", + core.regs.s8, core.regs.s9, core.regs.s10, core.regs.s11); + fprintf(stream, + " T3: 0x%016lx T4: 0x%016lx T5: 0x%016lx T6: 0x%016lx\n", + core.regs.t3, core.regs.t4, core.regs.t5, core.regs.t6); +} + +static void __aligned(16) guest_unexp_trap(void) +{ + sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, + KVM_RISCV_SELFTESTS_SBI_UNEXP, + 0, 0, 0, 0, 0, 0); +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) +{ + int r; + size_t stack_size = vm->page_size == 4096 ? 
+ DEFAULT_STACK_PGS * vm->page_size : + vm->page_size; + unsigned long stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_RISCV_GUEST_STACK_VADDR_MIN); + unsigned long current_gp = 0; + struct kvm_mp_state mps; + struct kvm_vcpu *vcpu; + + vcpu = __vm_vcpu_add(vm, vcpu_id); + riscv_vcpu_mmu_setup(vcpu); + + /* + * With SBI HSM support in KVM RISC-V, all secondary VCPUs are + * powered-off by default so we ensure that all secondary VCPUs + * are powered-on using KVM_SET_MP_STATE ioctl(). + */ + mps.mp_state = KVM_MP_STATE_RUNNABLE; + r = __vcpu_ioctl(vcpu, KVM_SET_MP_STATE, &mps); + TEST_ASSERT(!r, "IOCTL KVM_SET_MP_STATE failed (error %d)", r); + + /* Setup global pointer of guest to be same as the host */ + asm volatile ( + "add %0, gp, zero" : "=r" (current_gp) : : "memory"); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.gp), current_gp); + + /* Setup stack pointer and program counter of guest */ + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.sp), stack_vaddr + stack_size); + vcpu_set_reg(vcpu, RISCV_CORE_REG(regs.pc), (unsigned long)guest_code); + + /* Setup default exception vector of guest */ + vcpu_set_reg(vcpu, RISCV_CSR_REG(stvec), (unsigned long)guest_unexp_trap); + + return vcpu; +} + +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
+{ + va_list ap; + uint64_t id = RISCV_CORE_REG(regs.a0); + int i; + + TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n" + " num: %u\n", num); + + va_start(ap, num); + + for (i = 0; i < num; i++) { + switch (i) { + case 0: + id = RISCV_CORE_REG(regs.a0); + break; + case 1: + id = RISCV_CORE_REG(regs.a1); + break; + case 2: + id = RISCV_CORE_REG(regs.a2); + break; + case 3: + id = RISCV_CORE_REG(regs.a3); + break; + case 4: + id = RISCV_CORE_REG(regs.a4); + break; + case 5: + id = RISCV_CORE_REG(regs.a5); + break; + case 6: + id = RISCV_CORE_REG(regs.a6); + break; + case 7: + id = RISCV_CORE_REG(regs.a7); + break; + } + vcpu_set_reg(vcpu, id, va_arg(ap, uint64_t)); + } + + va_end(ap); +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ +} diff --git a/tools/testing/selftests/kvm/lib/riscv/ucall.c b/tools/testing/selftests/kvm/lib/riscv/ucall.c new file mode 100644 index 000000000000..087b9740bc8f --- /dev/null +++ b/tools/testing/selftests/kvm/lib/riscv/ucall.c @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ucall support. A ucall is a "hypercall to userspace". + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. 
/*
 * Execute an "ecall" instruction with the given extension id, function id
 * and six arguments.
 *
 * @ext is placed in a7, @fid in a6 and @arg0..@arg5 in a0..a5. After the
 * instruction returns, a0 is reported as sbiret.error and a1 as
 * sbiret.value.
 */
struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0,
			unsigned long arg1, unsigned long arg2,
			unsigned long arg3, unsigned long arg4,
			unsigned long arg5)
{
	/* Pin every operand to the register named in its asm("...") label. */
	register uintptr_t a0 asm ("a0") = (uintptr_t)(arg0);
	register uintptr_t a1 asm ("a1") = (uintptr_t)(arg1);
	register uintptr_t a2 asm ("a2") = (uintptr_t)(arg2);
	register uintptr_t a3 asm ("a3") = (uintptr_t)(arg3);
	register uintptr_t a4 asm ("a4") = (uintptr_t)(arg4);
	register uintptr_t a5 asm ("a5") = (uintptr_t)(arg5);
	register uintptr_t a6 asm ("a6") = (uintptr_t)(fid);
	register uintptr_t a7 asm ("a7") = (uintptr_t)(ext);
	struct sbiret ret;

	/*
	 * a0 and a1 are read-write operands ("+r") because they carry both
	 * the first two arguments in and the result pair out; the remaining
	 * registers are inputs only. The "memory" clobber keeps the compiler
	 * from caching memory contents across the call.
	 */
	asm volatile (
		"ecall"
		: "+r" (a0), "+r" (a1)
		: "r" (a2), "r" (a3), "r" (a4), "r" (a5), "r" (a6), "r" (a7)
		: "memory");
	ret.error = a0;
	ret.value = a1;

	return ret;
}
+{ + struct ucall uc = { + .cmd = cmd, + }; + va_list va; + int i; + + nargs = min(nargs, UCALL_MAX_ARGS); + + va_start(va, nargs); + for (i = 0; i < nargs; ++i) + uc.args[i] = va_arg(va, uint64_t); + va_end(va); + + sbi_ecall(KVM_RISCV_SELFTESTS_SBI_EXT, + KVM_RISCV_SELFTESTS_SBI_UCALL, + (vm_vaddr_t)&uc, 0, 0, 0, 0, 0); +} + +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + struct kvm_run *run = vcpu->run; + struct ucall ucall = {}; + + if (uc) + memset(uc, 0, sizeof(*uc)); + + if (run->exit_reason == KVM_EXIT_RISCV_SBI && + run->riscv_sbi.extension_id == KVM_RISCV_SELFTESTS_SBI_EXT) { + switch (run->riscv_sbi.function_id) { + case KVM_RISCV_SELFTESTS_SBI_UCALL: + memcpy(&ucall, + addr_gva2hva(vcpu->vm, run->riscv_sbi.args[0]), + sizeof(ucall)); + + vcpu_run_complete_io(vcpu); + if (uc) + memcpy(uc, &ucall, sizeof(ucall)); + + break; + case KVM_RISCV_SELFTESTS_SBI_UNEXP: + vcpu_dump(stderr, vcpu, 2); + TEST_ASSERT(0, "Unexpected trap taken by guest"); + break; + default: + break; + } + } + + return ucall.cmd; +} diff --git a/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c new file mode 100644 index 000000000000..cdb7daeed5fd --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390x/diag318_test_handler.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test handler for the s390x DIAGNOSE 0x0318 instruction. + * + * Copyright (C) 2020, IBM + */ + +#include "test_util.h" +#include "kvm_util.h" + +#define ICPT_INSTRUCTION 0x04 +#define IPA0_DIAG 0x8300 + +static void guest_code(void) +{ + uint64_t diag318_info = 0x12345678; + + asm volatile ("diag %0,0,0x318\n" : : "d" (diag318_info)); +} + +/* + * The DIAGNOSE 0x0318 instruction call must be handled via userspace. As such, + * we create an ad-hoc VM here to handle the instruction then extract the + * necessary data. It is up to the caller to decide what to do with that data. 
+ */ +static uint64_t diag318_handler(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + struct kvm_run *run; + uint64_t reg; + uint64_t diag318_info; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vcpu_run(vcpu); + run = vcpu->run; + + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "DIAGNOSE 0x0318 instruction was not intercepted"); + TEST_ASSERT(run->s390_sieic.icptcode == ICPT_INSTRUCTION, + "Unexpected intercept code: 0x%x", run->s390_sieic.icptcode); + TEST_ASSERT((run->s390_sieic.ipa & 0xff00) == IPA0_DIAG, + "Unexpected IPA0 code: 0x%x", (run->s390_sieic.ipa & 0xff00)); + + reg = (run->s390_sieic.ipa & 0x00f0) >> 4; + diag318_info = run->s.regs.gprs[reg]; + + TEST_ASSERT(diag318_info != 0, "DIAGNOSE 0x0318 info not set"); + + kvm_vm_free(vm); + + return diag318_info; +} + +uint64_t get_diag318_info(void) +{ + static uint64_t diag318_info; + static bool printed_skip; + + /* + * If KVM does not support diag318, then return 0 to + * ensure tests do not break. + */ + if (!kvm_has_cap(KVM_CAP_S390_DIAG318)) { + if (!printed_skip) { + fprintf(stdout, "KVM_CAP_S390_DIAG318 not supported. " + "Skipping diag318 test.\n"); + printed_skip = true; + } + return 0; + } + + /* + * If a test has previously requested the diag318 info, + * then don't bother spinning up a temporary VM again. + */ + if (!diag318_info) + diag318_info = diag318_handler(); + + return diag318_info; +} diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c index 32a02360b1eb..89d7340d9cbd 100644 --- a/tools/testing/selftests/kvm/lib/s390x/processor.c +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -5,17 +5,12 @@ * Copyright (C) 2019, Red Hat, Inc. 
*/ -#define _GNU_SOURCE /* for program_invocation_name */ - #include "processor.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" - -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 #define PAGES_PER_REGION 4 -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +void virt_arch_pgd_alloc(struct kvm_vm *vm) { vm_paddr_t paddr; @@ -26,7 +21,7 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) return; paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); vm->pgd = paddr; @@ -38,12 +33,12 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) * a page table (ri == 4). Returns a suitable region/segment table entry * which points to the freshly allocated pages. */ -static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri) { uint64_t taddr; taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + KVM_GUEST_PAGE_TABLE_MIN_PADDR, 0); memset(addr_gpa2hva(vm, taddr), 0xff, PAGES_PER_REGION * vm->page_size); return (taddr & REGION_ENTRY_ORIGIN) @@ -51,24 +46,7 @@ static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); } -/* - * VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * gva - VM Virtual Address - * gpa - VM Physical Address - * memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a virtual translation for the page - * starting at vaddr to the page starting at paddr. 
- */ -void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, - uint32_t memslot) +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa) { int ri, idx; uint64_t *entry; @@ -95,7 +73,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, for (ri = 1; ri <= 4; ri++) { idx = (gva >> (64 - 11 * ri)) & 0x7ffu; if (entry[idx] & REGION_ENTRY_INVALID) - entry[idx] = virt_alloc_region(vm, ri, memslot); + entry[idx] = virt_alloc_region(vm, ri); entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); } @@ -107,27 +85,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, entry[idx] = gpa; } -/* - * Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gpa - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Translates the VM virtual address given by gva to a VM physical - * address and then locates the memory region containing the VM - * physical address, within the VM given by vm. When found, the host - * virtual address providing the memory to the vm physical address is - * returned. - * A TEST_ASSERT failure occurs if no region containing translated - * VM virtual address exists. - */ -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { int ri, idx; uint64_t *entry; @@ -188,7 +146,7 @@ static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, } } -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { if (!vm->pgd_created) return; @@ -196,83 +154,67 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) virt_dump_region(stream, vm, indent, vm->pgd); } -/* - * Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. 
- * extra_mem_pages - The size of extra memories to add (this will - * decide how much extra space we will need to - * setup the page tables using mem slot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - */ -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) -{ - /* - * The additional amount of pages required for the page tables is: - * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ... - * which is definitely smaller than (n / 256) * 2. - */ - uint64_t extra_pg_pages = extra_mem_pages / 256 * 2; - struct kvm_vm *vm; - - vm = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); - - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); - vm_vcpu_add_default(vm, vcpuid, guest_code); - - return vm; -} - -/* - * Adds a vCPU with reasonable defaults (i.e. a stack and initial PSW) - * - * Input Args: - * vcpuid - The id of the VCPU to add to the VM. 
- * guest_code - The vCPU's entry point - */ -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); uint64_t stack_vaddr; struct kvm_regs regs; struct kvm_sregs sregs; + struct kvm_vcpu *vcpu; struct kvm_run *run; TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", vm->page_size); stack_vaddr = vm_vaddr_alloc(vm, stack_size, - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); - vm_vcpu_add(vm, vcpuid); + vcpu = __vm_vcpu_add(vm, vcpu_id); /* Setup guest registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs.gprs[15] = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()) - 160; - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vcpu, &sregs); - run = vcpu_state(vm, vcpuid); + run = vcpu->run; run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ run->psw_addr = (uintptr_t)guest_code; + + return vcpu; } -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
{ - struct vcpu *vcpu = vm->vcpu_head; + va_list ap; + struct kvm_regs regs; + int i; + TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n" + " num: %u\n", + num); + + va_start(ap, num); + vcpu_regs_get(vcpu, ®s); + + for (i = 0; i < num; i++) + regs.gprs[i + 2] = va_arg(ap, uint64_t); + + vcpu_regs_set(vcpu, ®s); + va_end(ap); +} + +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) +{ fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", - indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); + indent, "", vcpu->run->psw_mask, vcpu->run->psw_addr); +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ } diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c index fd589dc9bfab..73dc4e21190f 100644 --- a/tools/testing/selftests/kvm/lib/s390x/ucall.c +++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c @@ -22,7 +22,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) @@ -33,21 +33,24 @@ void ucall(uint64_t cmd, int nargs, ...) 
asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory"); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_S390_SIEIC && run->s390_sieic.icptcode == 4 && (run->s390_sieic.ipa >> 8) == 0x83 && /* 0x83 means DIAGNOSE */ (run->s390_sieic.ipb >> 16) == 0x501) { int reg = run->s390_sieic.ipa & 0xf; - memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]), + memcpy(&ucall, addr_gva2hva(vcpu->vm, run->s.regs.gprs[reg]), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c index 031ba3c932ed..50e0cf41a7dd 100644 --- a/tools/testing/selftests/kvm/lib/sparsebit.c +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -1866,7 +1866,7 @@ void sparsebit_validate_internal(struct sparsebit *s) * of total bits set. */ if (s->num_set != total_bits_set) { - fprintf(stderr, "Number of bits set missmatch,\n" + fprintf(stderr, "Number of bits set mismatch,\n" " s->num_set: 0x%lx total_bits_set: 0x%lx", s->num_set, total_bits_set); @@ -1890,7 +1890,6 @@ void sparsebit_validate_internal(struct sparsebit *s) */ #include <stdlib.h> -#include <assert.h> struct range { sparsebit_idx_t first, last; diff --git a/tools/testing/selftests/kvm/lib/string_override.c b/tools/testing/selftests/kvm/lib/string_override.c new file mode 100644 index 000000000000..632398adc229 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/string_override.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stddef.h> + +/* + * Override the "basic" built-in string helpers so that they can be used in + * guest code. 
/*
 * Compare the first @count bytes of @cs and @ct as unsigned chars.
 * Returns 0 when they are equal, otherwise the difference between the first
 * pair of bytes that differ.
 */
int memcmp(const void *cs, const void *ct, size_t count)
{
	const unsigned char *lhs = cs;
	const unsigned char *rhs = ct;
	size_t pos;

	for (pos = 0; pos < count; pos++) {
		int diff = lhs[pos] - rhs[pos];

		if (diff != 0)
			return diff;
	}
	return 0;
}

/*
 * Copy @count bytes from @src to @dest, one byte at a time in ascending
 * address order. Returns @dest.
 */
void *memcpy(void *dest, const void *src, size_t count)
{
	char *out = dest;
	const char *in = src;
	size_t pos;

	for (pos = 0; pos < count; pos++)
		out[pos] = in[pos];
	return dest;
}

/*
 * Fill the first @count bytes of @s with @c converted to char.
 * Returns @s.
 */
void *memset(void *s, int c, size_t count)
{
	char *out = s;
	size_t pos;

	for (pos = 0; pos < count; pos++)
		out[pos] = c;
	return s;
}
/*
 * Return @ts advanced by @ns nanoseconds; @ns may be negative.
 *
 * The result is normalized so that tv_nsec always lies in
 * [0, 999999999]: a negative remainder borrows one second from tv_sec
 * instead of being left as a negative tv_nsec. The sum is formed in 64-bit
 * arithmetic so it does not depend on the width of tv_nsec.
 */
struct timespec timespec_add_ns(struct timespec ts, int64_t ns)
{
	const int64_t nsec_per_sec = 1000000000LL;
	int64_t total_nsec = (int64_t)ts.tv_nsec + ns;
	int64_t carry_sec = total_nsec / nsec_per_sec;
	int64_t rem_nsec = total_nsec % nsec_per_sec;
	struct timespec res;

	/* C division truncates toward zero, so the remainder can be negative. */
	if (rem_nsec < 0) {
		rem_nsec += nsec_per_sec;
		carry_sec--;
	}

	res.tv_sec = ts.tv_sec + carry_sec;
	res.tv_nsec = rem_nsec;

	return res;
}
/*
 * Return the transparent huge page size read from
 * /sys/kernel/mm/transparent_hugepage/hpage_pmd_size.
 *
 * The test fails if THP is not configured, if the file cannot be opened or
 * if no value can be parsed from it. The value is read exactly once and the
 * read is considered successful only when fscanf() reports one converted
 * item; "%zu" matches the size_t destination.
 */
size_t get_trans_hugepagesz(void)
{
	size_t size;
	FILE *f;
	int ret;

	TEST_ASSERT(thp_configured(), "THP is not configured in host kernel");

	f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
	TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size");

	ret = fscanf(f, "%zu", &size);
	fclose(f);
	TEST_ASSERT(ret == 1, "Error reading transparent_hugepage/hpage_pmd_size");

	return size;
}
ANON_HUGE_FLAGS | MAP_HUGE_16KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { + .name = "anonymous_hugetlb_64kb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_64KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { + .name = "anonymous_hugetlb_512kb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512KB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { + .name = "anonymous_hugetlb_1mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { + .name = "anonymous_hugetlb_2mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { + .name = "anonymous_hugetlb_8mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_8MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { + .name = "anonymous_hugetlb_16mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { + .name = "anonymous_hugetlb_32mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_32MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { + .name = "anonymous_hugetlb_256mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_256MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { + .name = "anonymous_hugetlb_512mb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_512MB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { + .name = "anonymous_hugetlb_1gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_1GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { + .name = "anonymous_hugetlb_2gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_2GB, + }, + [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { + .name = "anonymous_hugetlb_16gb", + .flag = ANON_HUGE_FLAGS | MAP_HUGE_16GB, + }, + [VM_MEM_SRC_SHMEM] = { + .name = "shmem", + .flag = MAP_SHARED, + }, + [VM_MEM_SRC_SHARED_HUGETLB] = { + .name = "shared_hugetlb", + /* + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since + * we're using "file backed" memory, we need to specify + * this when the FD is created, not when the area is + * mapped. 
+ */ + .flag = MAP_SHARED, + }, + }; + _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, + "Missing new backing src types?"); + + TEST_ASSERT(i < NUM_SRC_TYPES, "Backing src type ID %d too big", i); + + return &aliases[i]; +} + +#define MAP_HUGE_PAGE_SIZE(x) (1ULL << ((x >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)) + +size_t get_backing_src_pagesz(uint32_t i) +{ + uint32_t flag = vm_mem_backing_src_alias(i)->flag; + + switch (i) { + case VM_MEM_SRC_ANONYMOUS: + case VM_MEM_SRC_SHMEM: + return getpagesize(); + case VM_MEM_SRC_ANONYMOUS_THP: + return get_trans_hugepagesz(); + case VM_MEM_SRC_ANONYMOUS_HUGETLB: + case VM_MEM_SRC_SHARED_HUGETLB: + return get_def_hugetlb_pagesz(); + default: + return MAP_HUGE_PAGE_SIZE(flag); + } +} + +bool is_backing_src_hugetlb(uint32_t i) +{ + return !!(vm_mem_backing_src_alias(i)->flag & MAP_HUGETLB); +} + +static void print_available_backing_src_types(const char *prefix) +{ + int i; + + printf("%sAvailable backing src types:\n", prefix); + + for (i = 0; i < NUM_SRC_TYPES; i++) + printf("%s %s\n", prefix, vm_mem_backing_src_alias(i)->name); +} + +void backing_src_help(const char *flag) +{ + printf(" %s: specify the type of memory that should be used to\n" + " back the guest data region. 
(default: %s)\n", + flag, vm_mem_backing_src_alias(DEFAULT_VM_MEM_SRC)->name); + print_available_backing_src_types(" "); +} + +enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name) +{ + int i; + + for (i = 0; i < NUM_SRC_TYPES; i++) + if (!strcmp(type_name, vm_mem_backing_src_alias(i)->name)) + return i; + + print_available_backing_src_types(""); + TEST_FAIL("Unknown backing src type: %s", type_name); + return -1; +} + +long get_run_delay(void) +{ + char path[64]; + long val[2]; + FILE *fp; + + sprintf(path, "/proc/%ld/schedstat", syscall(SYS_gettid)); + fp = fopen(path, "r"); + /* Return MIN_RUN_DELAY_NS upon failure just to be safe */ + if (fscanf(fp, "%ld %ld ", &val[0], &val[1]) < 2) + val[1] = MIN_RUN_DELAY_NS; + fclose(fp); + + return val[1]; +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c new file mode 100644 index 000000000000..7168e25c194e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/apic.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/processor.c + * + * Copyright (C) 2021, Google LLC. + */ + +#include "apic.h" + +void apic_disable(void) +{ + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) & + ~(MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD)); +} + +void xapic_enable(void) +{ + uint64_t val = rdmsr(MSR_IA32_APICBASE); + + /* Per SDM: to enable xAPIC when in x2APIC must first disable APIC */ + if (val & MSR_IA32_APICBASE_EXTD) { + apic_disable(); + wrmsr(MSR_IA32_APICBASE, + rdmsr(MSR_IA32_APICBASE) | MSR_IA32_APICBASE_ENABLE); + } else if (!(val & MSR_IA32_APICBASE_ENABLE)) { + wrmsr(MSR_IA32_APICBASE, val | MSR_IA32_APICBASE_ENABLE); + } + + /* + * Per SDM: reset value of spurious interrupt vector register has the + * APIC software enabled bit=0. It must be enabled in addition to the + * enable bit in the MSR. 
+ */ + val = xapic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED; + xapic_write_reg(APIC_SPIV, val); +} + +void x2apic_enable(void) +{ + wrmsr(MSR_IA32_APICBASE, rdmsr(MSR_IA32_APICBASE) | + MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD); + x2apic_write_reg(APIC_SPIV, + x2apic_read_reg(APIC_SPIV) | APIC_SPIV_APIC_ENABLED); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S new file mode 100644 index 000000000000..7629819734af --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S @@ -0,0 +1,81 @@ +handle_exception: + push %r15 + push %r14 + push %r13 + push %r12 + push %r11 + push %r10 + push %r9 + push %r8 + + push %rdi + push %rsi + push %rbp + push %rbx + push %rdx + push %rcx + push %rax + mov %rsp, %rdi + + call route_exception + + pop %rax + pop %rcx + pop %rdx + pop %rbx + pop %rbp + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + /* Discard vector and error code. */ + add $16, %rsp + iretq + +/* + * Build the handle_exception wrappers which push the vector/error code on the + * stack and an array of pointers to those wrappers. + */ +.pushsection .rodata +.globl idt_handlers +idt_handlers: +.popsection + +.macro HANDLERS has_error from to + vector = \from + .rept \to - \from + 1 + .align 8 + + /* Fetch current address and append it to idt_handlers. */ +666 : +.pushsection .rodata + .quad 666b +.popsection + + .if ! 
\has_error + pushq $0 + .endif + pushq $vector + jmp handle_exception + vector = vector + 1 + .endr +.endm + +.global idt_handler_code +idt_handler_code: + HANDLERS has_error=0 from=0 to=7 + HANDLERS has_error=1 from=8 to=8 + HANDLERS has_error=0 from=9 to=9 + HANDLERS has_error=1 from=10 to=14 + HANDLERS has_error=0 from=15 to=16 + HANDLERS has_error=1 from=17 to=17 + HANDLERS has_error=0 from=18 to=255 + +.section .note.GNU-stack, "", %progbits diff --git a/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c new file mode 100644 index 000000000000..0f344a7c89c4 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/perf_test_util.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86_64-specific extensions to perf_test_util.c. + * + * Copyright (C) 2022, Google, Inc. + */ +#include <stdio.h> +#include <stdlib.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" +#include "vmx.h" + +void perf_test_l2_guest_code(uint64_t vcpu_id) +{ + perf_test_guest_code(vcpu_id); + vmcall(); +} + +extern char perf_test_l2_guest_entry[]; +__asm__( +"perf_test_l2_guest_entry:" +" mov (%rsp), %rdi;" +" call perf_test_l2_guest_code;" +" ud2;" +); + +static void perf_test_l1_guest_code(struct vmx_pages *vmx, uint64_t vcpu_id) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + unsigned long *rsp; + + GUEST_ASSERT(vmx->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx)); + GUEST_ASSERT(load_vmcs(vmx)); + GUEST_ASSERT(ept_1g_pages_supported()); + + rsp = &l2_guest_stack[L2_GUEST_STACK_SIZE - 1]; + *rsp = vcpu_id; + prepare_vmcs(vmx, perf_test_l2_guest_entry, rsp); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_DONE(); +} + +uint64_t perf_test_nested_pages(int nr_vcpus) +{ + /* + * 513 page tables is enough 
to identity-map 256 TiB of L2 with 1G + * pages and 4-level paging, plus a few pages per-vCPU for data + * structures such as the VMCS. + */ + return 513 + 10 * nr_vcpus; +} + +void perf_test_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + uint64_t start, end; + + prepare_eptp(vmx, vm, 0); + + /* + * Identity map the first 4G and the test region with 1G pages so that + * KVM can shadow the EPT12 with the maximum huge page size supported + * by the backing source. + */ + nested_identity_map_1g(vmx, vm, 0, 0x100000000ULL); + + start = align_down(perf_test_args.gpa, PG_SIZE_1G); + end = align_up(perf_test_args.gpa + perf_test_args.size, PG_SIZE_1G); + nested_identity_map_1g(vmx, vm, start, end - start); +} + +void perf_test_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]) +{ + struct vmx_pages *vmx, *vmx0 = NULL; + struct kvm_regs regs; + vm_vaddr_t vmx_gva; + int vcpu_id; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + vmx = vcpu_alloc_vmx(vm, &vmx_gva); + + if (vcpu_id == 0) { + perf_test_setup_ept(vmx, vm); + vmx0 = vmx; + } else { + /* Share the same EPT table across all vCPUs. */ + vmx->eptp = vmx0->eptp; + vmx->eptp_hva = vmx0->eptp_hva; + vmx->eptp_gpa = vmx0->eptp_gpa; + } + + /* + * Override the vCPU to run perf_test_l1_guest_code() which will + * bounce it into L2 before calling perf_test_guest_code(). + */ + vcpu_regs_get(vcpus[vcpu_id], ®s); + regs.rip = (unsigned long) perf_test_l1_guest_code; + vcpu_regs_set(vcpus[vcpu_id], ®s); + vcpu_args_set(vcpus[vcpu_id], 2, vmx_gva, vcpu_id); + } +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 683d3bdb8f6a..39c4409ef56a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -5,94 +5,22 @@ * Copyright (C) 2018, Google LLC. 
*/ -#define _GNU_SOURCE /* for program_invocation_name */ - #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" -/* Minimum physical address used for virtual translation tables. */ -#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 - -/* Virtual translation table structure declarations */ -struct pageMapL4Entry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; +#ifndef NUM_INTERRUPTS +#define NUM_INTERRUPTS 256 +#endif -struct pageDirectoryPointerEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; +#define DEFAULT_CODE_SELECTOR 0x8 +#define DEFAULT_DATA_SELECTOR 0x10 -struct pageDirectoryEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t ignored_06:1; - uint64_t page_size:1; - uint64_t ignored_11_08:4; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; +#define MAX_NR_CPUID_ENTRIES 100 -struct pageTableEntry { - uint64_t present:1; - uint64_t writable:1; - uint64_t user:1; - uint64_t write_through:1; - uint64_t cache_disable:1; - uint64_t accessed:1; - uint64_t dirty:1; - uint64_t reserved_07:1; - uint64_t global:1; - uint64_t ignored_11_09:3; - uint64_t address:40; - uint64_t ignored_62_52:11; - uint64_t execute_disable:1; -}; +vm_vaddr_t exception_handlers; -/* Register Dump - * - * Input Args: - * indent - Left margin indent amount - * regs - register - * - * 
Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the registers given by regs, to the FILE stream - * given by steam. - */ -void regs_dump(FILE *stream, struct kvm_regs *regs, - uint8_t indent) +static void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent) { fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx " "rcx: 0x%.16llx rdx: 0x%.16llx\n", @@ -115,20 +43,6 @@ void regs_dump(FILE *stream, struct kvm_regs *regs, regs->rip, regs->rflags); } -/* Segment Dump - * - * Input Args: - * indent - Left margin indent amount - * segment - KVM segment - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the KVM segment given by segment, to the FILE stream - * given by steam. - */ static void segment_dump(FILE *stream, struct kvm_segment *segment, uint8_t indent) { @@ -146,20 +60,6 @@ static void segment_dump(FILE *stream, struct kvm_segment *segment, segment->unusable, segment->padding); } -/* dtable Dump - * - * Input Args: - * indent - Left margin indent amount - * dtable - KVM dtable - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the KVM dtable given by dtable, to the FILE stream - * given by steam. - */ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, uint8_t indent) { @@ -169,22 +69,7 @@ static void dtable_dump(FILE *stream, struct kvm_dtable *dtable, dtable->padding[0], dtable->padding[1], dtable->padding[2]); } -/* System Register Dump - * - * Input Args: - * indent - Left margin indent amount - * sregs - System registers - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the state of the system registers given by sregs, to the FILE stream - * given by steam. 
- */ -void sregs_dump(FILE *stream, struct kvm_sregs *sregs, - uint8_t indent) +static void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent) { unsigned int i; @@ -226,126 +111,230 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, } } -void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) +bool kvm_is_tdp_enabled(void) +{ + if (is_intel_cpu()) + return get_kvm_intel_param_bool("ept"); + else + return get_kvm_amd_param_bool("npt"); +} + +void virt_arch_pgd_alloc(struct kvm_vm *vm) { TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); /* If needed, create page map l4 table. */ if (!vm->pgd_created) { - vm_paddr_t paddr = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot); - vm->pgd = paddr; + vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; } } -/* VM Virtual Page Map - * - * Input Args: - * vm - Virtual Machine - * vaddr - VM Virtual Address - * paddr - VM Physical Address - * pgd_memslot - Memory region slot for new virtual translation tables - * - * Output Args: None - * - * Return: None - * - * Within the VM given by vm, creates a virtual translation for the page - * starting at vaddr to the page starting at paddr. 
- */ -void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, - uint32_t pgd_memslot) +static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr, + int level) +{ + uint64_t *page_table = addr_gpa2hva(vm, pt_pfn << vm->page_shift); + int index = (vaddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + + return &page_table[index]; +} + +static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, + uint64_t pt_pfn, + uint64_t vaddr, + uint64_t paddr, + int current_level, + int target_level) +{ + uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, current_level); + + if (!(*pte & PTE_PRESENT_MASK)) { + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK; + if (current_level == target_level) + *pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK); + else + *pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. + */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, vaddr: 0x%lx\n", + current_level, vaddr); + TEST_ASSERT(!(*pte & PTE_LARGE_MASK), + "Cannot create page table at level: %u, vaddr: 0x%lx\n", + current_level, vaddr); + } + return pte; +} + +void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) +{ + const uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; + + TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, + "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + + TEST_ASSERT((vaddr % pg_size) == 0, + "Virtual address not aligned,\n" + "vaddr: 0x%lx page size: 0x%lx", vaddr, pg_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", vaddr); + TEST_ASSERT((paddr % pg_size) == 0, + "Physical address not aligned,\n" + " paddr: 0x%lx page size: 0x%lx", paddr, pg_size); + TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, + 
"Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + paddr, vm->max_gfn, vm->page_size); + + /* + * Allocate upper level page tables, if not already present. Return + * early if a hugepage was created. + */ + pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift, + vaddr, paddr, PG_LEVEL_512G, level); + if (*pml4e & PTE_LARGE_MASK) + return; + + pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, PG_LEVEL_1G, level); + if (*pdpe & PTE_LARGE_MASK) + return; + + pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, PG_LEVEL_2M, level); + if (*pde & PTE_LARGE_MASK) + return; + + /* Fill in page table entry. */ + pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, PG_LEVEL_4K); + TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), + "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr); + *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); +} + +void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) +{ + __virt_pg_map(vm, vaddr, paddr, PG_LEVEL_4K); +} + +void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, + uint64_t nr_bytes, int level) +{ + uint64_t pg_size = PG_LEVEL_SIZE(level); + uint64_t nr_pages = nr_bytes / pg_size; + int i; + + TEST_ASSERT(nr_bytes % pg_size == 0, + "Region size not aligned: nr_bytes: 0x%lx, page size: 0x%lx", + nr_bytes, pg_size); + + for (i = 0; i < nr_pages; i++) { + __virt_pg_map(vm, vaddr, paddr, level); + + vaddr += pg_size; + paddr += pg_size; + } +} + +static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, + struct kvm_vcpu *vcpu, + uint64_t vaddr) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; + struct kvm_sregs sregs; + uint64_t rsvd_mask = 0; + + /* Set the high bits in the reserved mask. 
*/ + if (vm->pa_bits < 52) + rsvd_mask = GENMASK_ULL(51, vm->pa_bits); + + /* + * SDM vol 3, fig 4-11 "Formats of CR3 and Paging-Structure Entries + * with 4-Level Paging and 5-Level Paging". + * If IA32_EFER.NXE = 0 and the P flag of a paging-structure entry is 1, + * the XD flag (bit 63) is reserved. + */ + vcpu_sregs_get(vcpu, &sregs); + if ((sregs.efer & EFER_NX) == 0) { + rsvd_mask |= PTE_NX_MASK; + } TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - - TEST_ASSERT((vaddr % vm->page_size) == 0, - "Virtual address not on page boundary,\n" - " vaddr: 0x%lx vm->page_size: 0x%x", - vaddr, vm->page_size); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); - TEST_ASSERT((paddr % vm->page_size) == 0, - "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); - TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, - "Physical address beyond beyond maximum supported,\n" - " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", - paddr, vm->max_gfn, vm->page_size); + /* + * Based on the mode check above there are 48 bits in the vaddr, so + * shift 16 to sign extend the last bit (bit-47), + */ + TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), + "Canonical check failed. The virtual address is invalid."); index[0] = (vaddr >> 12) & 0x1ffu; index[1] = (vaddr >> 21) & 0x1ffu; index[2] = (vaddr >> 30) & 0x1ffu; index[3] = (vaddr >> 39) & 0x1ffu; - /* Allocate page directory pointer table if not present. 
*/ pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].present = true; - } + TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK, + "Expected pml4e to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0, + "Unexpected reserved bits set."); + + pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); + TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK, + "Expected pdpe to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK), + "Expected pdpe to map a pde not a 1-GByte page."); + TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); + TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK, + "Expected pde to be present for gva: 0x%08lx", vaddr); + TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK), + "Expected pde to map a pte not a 2-MByte page."); + TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0, + "Unexpected reserved bits set."); + + pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); + TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK, + "Expected pte to be present for gva: 0x%08lx", vaddr); + + return &pte[index[0]]; +} - /* Allocate page directory table if not present. 
*/ - struct pageDirectoryPointerEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].present = true; - } +uint64_t vm_get_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr) +{ + uint64_t *pte = _vm_get_page_table_entry(vm, vcpu, vaddr); - /* Allocate page table if not present. */ - struct pageDirectoryEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].present = true; - } + return *(uint64_t *)pte; +} - /* Fill in page table entry. */ - struct pageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].present = 1; +void vm_set_page_table_entry(struct kvm_vm *vm, struct kvm_vcpu *vcpu, + uint64_t vaddr, uint64_t pte) +{ + uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpu, vaddr); + + *(uint64_t *)new_pte = pte; } -/* Virtual Translation Tables Dump - * - * Input Args: - * vm - Virtual Machine - * indent - Left margin indent amount - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps to the FILE stream given by stream, the contents of all the - * virtual translation tables for the VM given by vm. 
- */ -void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +void virt_arch_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { - struct pageMapL4Entry *pml4e, *pml4e_start; - struct pageDirectoryPointerEntry *pdpe, *pdpe_start; - struct pageDirectoryEntry *pde, *pde_start; - struct pageTableEntry *pte, *pte_start; + uint64_t *pml4e, *pml4e_start; + uint64_t *pdpe, *pdpe_start; + uint64_t *pde, *pde_start; + uint64_t *pte, *pte_start; if (!vm->pgd_created) return; @@ -355,62 +344,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*s index hvaddr gpaddr " "addr w exec dirty\n", indent, ""); - pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm, - vm->pgd); + pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd); for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) { pml4e = &pml4e_start[n1]; - if (!pml4e->present) + if (!(*pml4e & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u " + fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u " " %u\n", indent, "", pml4e - pml4e_start, pml4e, - addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address, - pml4e->writable, pml4e->execute_disable); + addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e), + !!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK)); - pdpe_start = addr_gpa2hva(vm, pml4e->address - * vm->page_size); + pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK); for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) { pdpe = &pdpe_start[n2]; - if (!pdpe->present) + if (!(*pdpe & PTE_PRESENT_MASK)) continue; - fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx " + fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10llx " "%u %u\n", indent, "", pdpe - pdpe_start, pdpe, addr_hva2gpa(vm, pdpe), - (uint64_t) pdpe->address, pdpe->writable, - pdpe->execute_disable); + PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK), + !!(*pdpe & PTE_NX_MASK)); - pde_start = addr_gpa2hva(vm, - pdpe->address * vm->page_size); + pde_start = addr_gpa2hva(vm, 
*pdpe & PHYSICAL_PAGE_MASK); for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) { pde = &pde_start[n3]; - if (!pde->present) + if (!(*pde & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spde 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u\n", + "0x%-12lx 0x%-10llx %u %u\n", indent, "", pde - pde_start, pde, addr_hva2gpa(vm, pde), - (uint64_t) pde->address, pde->writable, - pde->execute_disable); + PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK), + !!(*pde & PTE_NX_MASK)); - pte_start = addr_gpa2hva(vm, - pde->address * vm->page_size); + pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK); for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) { pte = &pte_start[n4]; - if (!pte->present) + if (!(*pte & PTE_PRESENT_MASK)) continue; fprintf(stream, "%*spte 0x%-3zx %p " - "0x%-12lx 0x%-10lx %u %u " + "0x%-12lx 0x%-10llx %u %u " " %u 0x%-10lx\n", indent, "", pte - pte_start, pte, addr_hva2gpa(vm, pte), - (uint64_t) pte->address, - pte->writable, - pte->execute_disable, - pte->dirty, + PTE_GET_PFN(*pte), + !!(*pte & PTE_WRITABLE_MASK), + !!(*pte & PTE_NX_MASK), + !!(*pte & PTE_DIRTY_MASK), ((uint64_t) n1 << 27) | ((uint64_t) n2 << 18) | ((uint64_t) n3 << 9) @@ -421,7 +406,8 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) } } -/* Set Unusable Segment +/* + * Set Unusable Segment * * Input Args: None * @@ -430,7 +416,7 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) * * Return: None * - * Sets the segment register pointed to by segp to an unusable state. + * Sets the segment register pointed to by @segp to an unusable state. 
*/ static void kvm_seg_set_unusable(struct kvm_segment *segp) { @@ -446,11 +432,12 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) desc->limit0 = segp->limit & 0xFFFF; desc->base0 = segp->base & 0xFFFF; desc->base1 = segp->base >> 16; - desc->s = segp->s; desc->type = segp->type; + desc->s = segp->s; desc->dpl = segp->dpl; desc->p = segp->present; desc->limit1 = segp->limit >> 16; + desc->avl = segp->avl; desc->l = segp->l; desc->db = segp->db; desc->g = segp->g; @@ -460,7 +447,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) } -/* Set Long Mode Flat Kernel Code Segment +/* + * Set Long Mode Flat Kernel Code Segment * * Input Args: * vm - VM whose GDT is being filled, or NULL to only write segp @@ -471,8 +459,8 @@ static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp) * * Return: None * - * Sets up the KVM segment pointed to by segp, to be a code segment - * with the selector value given by selector. + * Sets up the KVM segment pointed to by @segp, to be a code segment + * with the selector value given by @selector. */ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) @@ -491,7 +479,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, kvm_seg_fill_gdt_64bit(vm, segp); } -/* Set Long Mode Flat Kernel Data Segment +/* + * Set Long Mode Flat Kernel Data Segment * * Input Args: * vm - VM whose GDT is being filled, or NULL to only write segp @@ -502,8 +491,8 @@ static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector, * * Return: None * - * Sets up the KVM segment pointed to by segp, to be a data segment - * with the selector value given by selector. + * Sets up the KVM segment pointed to by @segp, to be a data segment + * with the selector value given by @selector. 
*/ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, struct kvm_segment *segp) @@ -521,31 +510,11 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector, kvm_seg_fill_gdt_64bit(vm, segp); } -/* Address Guest Virtual to Guest Physical - * - * Input Args: - * vm - Virtual Machine - * gpa - VM virtual address - * - * Output Args: None - * - * Return: - * Equivalent VM physical address - * - * Translates the VM virtual address given by gva to a VM physical - * address and then locates the memory region containing the VM - * physical address, within the VM given by vm. When found, the host - * virtual address providing the memory to the vm physical address is returned. - * A TEST_ASSERT failure occurs if no region containing translated - * VM virtual address exists. - */ -vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) { uint16_t index[4]; - struct pageMapL4Entry *pml4e; - struct pageDirectoryPointerEntry *pdpe; - struct pageDirectoryEntry *pde; - struct pageTableEntry *pte; + uint64_t *pml4e, *pdpe, *pde; + uint64_t *pte; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); @@ -558,47 +527,42 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) if (!vm->pgd_created) goto unmapped_gva; pml4e = addr_gpa2hva(vm, vm->pgd); - if (!pml4e[index[3]].present) + if (!(pml4e[index[3]] & PTE_PRESENT_MASK)) goto unmapped_gva; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].present) + pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size); + if (!(pdpe[index[2]] & PTE_PRESENT_MASK)) goto unmapped_gva; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].present) + pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size); + if (!(pde[index[1]] & PTE_PRESENT_MASK)) goto 
unmapped_gva; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - if (!pte[index[0]].present) + pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size); + if (!(pte[index[0]] & PTE_PRESENT_MASK)) goto unmapped_gva; - return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu); + return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & ~PAGE_MASK); unmapped_gva: - TEST_ASSERT(false, "No mapping for vm virtual address, " - "gva: 0x%lx", gva); + TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); exit(EXIT_FAILURE); } -static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot, - int pgd_memslot) +static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt) { if (!vm->gdt) - vm->gdt = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->gdt = vm_vaddr_alloc_page(vm); dt->base = vm->gdt; dt->limit = getpagesize(); } static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, - int selector, int gdt_memslot, - int pgd_memslot) + int selector) { if (!vm->tss) - vm->tss = vm_vaddr_alloc(vm, getpagesize(), - KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot); + vm->tss = vm_vaddr_alloc_page(vm); memset(segp, 0, sizeof(*segp)); segp->base = vm->tss; @@ -609,16 +573,16 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, kvm_seg_fill_gdt_64bit(vm, segp); } -static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +static void vcpu_setup(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; /* Set mode specific system register values. 
*/ - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs.idt.limit = 0; - kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); + kvm_setup_gdt(vm, &sregs.gdt); switch (vm->mode) { case VM_MODE_PXXV48_4K: @@ -627,322 +591,241 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); - kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds); - kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es); - kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot); + kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds); + kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es); + kvm_setup_tss_64bit(vm, &sregs.tr, 0x18); break; default: - TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode); + TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } sregs.cr3 = vm->pgd; - vcpu_sregs_set(vm, vcpuid, &sregs); + vcpu_sregs_set(vcpu, &sregs); } -/* Adds a vCPU with reasonable defaults (i.e., a stack) - * - * Input Args: - * vcpuid - The id of the VCPU to add to the VM. 
- * guest_code - The vCPU's entry point - */ -void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) + +void __vm_xsave_require_permission(int bit, const char *name) +{ + int kvm_fd; + u64 bitmask; + long rc; + struct kvm_device_attr attr = { + .group = 0, + .attr = KVM_X86_XCOMP_GUEST_SUPP, + .addr = (unsigned long) &bitmask + }; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD)); + + kvm_fd = open_kvm_dev_path_or_exit(); + rc = __kvm_ioctl(kvm_fd, KVM_GET_DEVICE_ATTR, &attr); + close(kvm_fd); + + if (rc == -1 && (errno == ENXIO || errno == EINVAL)) + __TEST_REQUIRE(0, "KVM_X86_XCOMP_GUEST_SUPP not supported"); + + TEST_ASSERT(rc == 0, "KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) error: %ld", rc); + + __TEST_REQUIRE(bitmask & (1ULL << bit), + "Required XSAVE feature '%s' not supported", name); + + TEST_REQUIRE(!syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit)); + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &bitmask); + TEST_ASSERT(rc == 0, "prctl(ARCH_GET_XCOMP_GUEST_PERM) error: %ld", rc); + TEST_ASSERT(bitmask & (1ULL << bit), + "prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure bitmask=0x%lx", + bitmask); +} + +struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id, + void *guest_code) { struct kvm_mp_state mp_state; struct kvm_regs regs; vm_vaddr_t stack_vaddr; + struct kvm_vcpu *vcpu; + stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), - DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + DEFAULT_GUEST_STACK_VADDR_MIN); - /* Create VCPU */ - vm_vcpu_add(vm, vcpuid); - vcpu_setup(vm, vcpuid, 0, 0); + vcpu = __vm_vcpu_add(vm, vcpu_id); + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); + vcpu_setup(vm, vcpu); /* Setup guest general purpose registers */ - vcpu_regs_get(vm, vcpuid, ®s); + vcpu_regs_get(vcpu, ®s); regs.rflags = regs.rflags | 0x2; regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); regs.rip = (unsigned long) guest_code; - vcpu_regs_set(vm, vcpuid, ®s); + vcpu_regs_set(vcpu, ®s); 
/* Setup the MP state */ mp_state.mp_state = 0; - vcpu_set_mp_state(vm, vcpuid, &mp_state); + vcpu_mp_state_set(vcpu, &mp_state); + + return vcpu; } -/* Allocate an instance of struct kvm_cpuid2 - * - * Input Args: None - * - * Output Args: None - * - * Return: A pointer to the allocated struct. The caller is responsible - * for freeing this struct. - * - * Since kvm_cpuid2 uses a 0-length array to allow a the size of the - * array to be decided at allocation time, allocation is slightly - * complicated. This function uses a reasonable default length for - * the array and performs the appropriate allocation. - */ -static struct kvm_cpuid2 *allocate_kvm_cpuid2(void) -{ - struct kvm_cpuid2 *cpuid; - int nent = 100; - size_t size; - - size = sizeof(*cpuid); - size += nent * sizeof(struct kvm_cpuid_entry2); - cpuid = malloc(size); - if (!cpuid) { - perror("malloc"); - abort(); - } +struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm, uint32_t vcpu_id) +{ + struct kvm_vcpu *vcpu = __vm_vcpu_add(vm, vcpu_id); - cpuid->nent = nent; + vcpu_init_cpuid(vcpu, kvm_get_supported_cpuid()); - return cpuid; + return vcpu; } -/* KVM Supported CPUID Get - * - * Input Args: None - * - * Output Args: - * - * Return: The supported KVM CPUID - * - * Get the guest CPUID supported by KVM. 
- */ -struct kvm_cpuid2 *kvm_get_supported_cpuid(void) +void vcpu_arch_free(struct kvm_vcpu *vcpu) +{ + if (vcpu->cpuid) + free(vcpu->cpuid); +} + +const struct kvm_cpuid2 *kvm_get_supported_cpuid(void) { static struct kvm_cpuid2 *cpuid; - int ret; int kvm_fd; if (cpuid) return cpuid; - cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); - ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); - TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", - ret, errno); + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); close(kvm_fd); return cpuid; } -/* Locate a cpuid entry. - * - * Input Args: - * cpuid: The cpuid. - * function: The function of the cpuid entry to find. - * - * Output Args: None - * - * Return: A pointer to the cpuid entry. Never returns NULL. - */ -struct kvm_cpuid_entry2 * -kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) +bool kvm_cpuid_has(const struct kvm_cpuid2 *cpuid, + struct kvm_x86_cpu_feature feature) { - struct kvm_cpuid2 *cpuid; - struct kvm_cpuid_entry2 *entry = NULL; + const struct kvm_cpuid_entry2 *entry; int i; - cpuid = kvm_get_supported_cpuid(); for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == function && - cpuid->entries[i].index == index) { - entry = &cpuid->entries[i]; - break; - } + entry = &cpuid->entries[i]; + + /* + * The output registers in kvm_cpuid_entry2 are in alphabetical + * order, but kvm_x86_cpu_feature matches that mess, so yay + * pointer shenanigans! 
+ */ + if (entry->function == feature.function && + entry->index == feature.index) + return (&entry->eax)[feature.reg] & BIT(feature.bit); } - TEST_ASSERT(entry, "Guest CPUID entry not found: (EAX=%x, ECX=%x).", - function, index); - return entry; + return false; } -/* VM VCPU CPUID Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU id - * cpuid - The CPUID values to set. - * - * Output Args: None - * - * Return: void - * - * Set the VCPU's CPUID. - */ -void vcpu_set_cpuid(struct kvm_vm *vm, - uint32_t vcpuid, struct kvm_cpuid2 *cpuid) +uint64_t kvm_get_feature_msr(uint64_t msr_index) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int rc; + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r, kvm_fd; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + kvm_fd = open_kvm_dev_path_or_exit(); - rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); - TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", - rc, errno); + r = __kvm_ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, KVM_IOCTL_ERROR(KVM_GET_MSRS, r)); + close(kvm_fd); + return buffer.entry.data; } -/* Create a VM with reasonable defaults - * - * Input Args: - * vcpuid - The id of the single VCPU to add to the VM. - * extra_mem_pages - The size of extra memories to add (this will - * decide how much extra space we will need to - * setup the page tables using mem slot 0) - * guest_code - The vCPU's entry point - * - * Output Args: None - * - * Return: - * Pointer to opaque structure that describes the created VM. - */ -struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, - void *guest_code) +void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid) { - struct kvm_vm *vm; - /* - * For x86 the maximum page table size for a memory region - * will be when only 4K pages are used. 
In that case the - * total extra size for page tables (for extra N pages) will - * be: N/512+N/512^2+N/512^3+... which is definitely smaller - * than N/512*2. - */ - uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; + TEST_ASSERT(cpuid != vcpu->cpuid, "@cpuid can't be the vCPU's CPUID"); - /* Create VM */ - vm = vm_create(VM_MODE_DEFAULT, - DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, - O_RDWR); + /* Allow overriding the default CPUID. */ + if (vcpu->cpuid && vcpu->cpuid->nent < cpuid->nent) { + free(vcpu->cpuid); + vcpu->cpuid = NULL; + } - /* Setup guest code */ - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + if (!vcpu->cpuid) + vcpu->cpuid = allocate_kvm_cpuid2(cpuid->nent); - /* Setup IRQ Chip */ - vm_create_irqchip(vm); + memcpy(vcpu->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent)); + vcpu_set_cpuid(vcpu); +} - /* Add the first vCPU. */ - vm_vcpu_add_default(vm, vcpuid, guest_code); +void vcpu_set_cpuid_maxphyaddr(struct kvm_vcpu *vcpu, uint8_t maxphyaddr) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, 0x80000008); - return vm; + entry->eax = (entry->eax & ~0xff) | maxphyaddr; + vcpu_set_cpuid(vcpu); } -/* VCPU Get MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * - * Output Args: None - * - * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. - * - * Get value of MSR for VCPU. 
- */ -uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +void vcpu_clear_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function) +{ + struct kvm_cpuid_entry2 *entry = vcpu_get_cpuid_entry(vcpu, function); + + entry->eax = 0; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + vcpu_set_cpuid(vcpu); +} + +void vcpu_set_or_clear_cpuid_feature(struct kvm_vcpu *vcpu, + struct kvm_x86_cpu_feature feature, + bool set) +{ + struct kvm_cpuid_entry2 *entry; + u32 *reg; + + entry = __vcpu_get_cpuid_entry(vcpu, feature.function, feature.index); + reg = (&entry->eax) + feature.reg; + + if (set) + *reg |= BIT(feature.bit); + else + *reg &= ~BIT(feature.bit); + + vcpu_set_cpuid(vcpu); +} + +uint64_t vcpu_get_msr(struct kvm_vcpu *vcpu, uint64_t msr_index) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); - TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + + vcpu_msrs_get(vcpu, &buffer.header); return buffer.entry.data; } -/* _VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: The result of KVM_SET_MSRS. - * - * Sets the value of an MSR for the given VCPU. 
- */ -int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) +int _vcpu_set_msr(struct kvm_vcpu *vcpu, uint64_t msr_index, uint64_t msr_value) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); struct { struct kvm_msrs header; struct kvm_msr_entry entry; } buffer = {}; - int r; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); memset(&buffer, 0, sizeof(buffer)); buffer.header.nmsrs = 1; buffer.entry.index = msr_index; buffer.entry.data = msr_value; - r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); - return r; -} - -/* VCPU Set MSR - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * msr_index - Index of MSR - * msr_value - New value of MSR - * - * Output Args: None - * - * Return: On success, nothing. On failure a TEST_ASSERT is produced. - * - * Set value of MSR for VCPU. - */ -void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, - uint64_t msr_value) -{ - int r; - r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value); - TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" - " rc: %i errno: %i", r, errno); + return __vcpu_ioctl(vcpu, KVM_SET_MSRS, &buffer.header); } -/* VM VCPU Args Set - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * num - number of arguments - * ... - arguments, each of type uint64_t - * - * Output Args: None - * - * Return: None - * - * Sets the first num function input arguments to the values - * given as variable args. Each of the variable args is expected to - * be of type uint64_t. - */ -void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) +void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { va_list ap; struct kvm_regs regs; @@ -952,7 +835,7 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) 
num); va_start(ap, num); - vcpu_regs_get(vm, vcpuid, &regs); + vcpu_regs_get(vcpu, &regs); if (num >= 1) regs.rdi = va_arg(ap, uint64_t); @@ -972,102 +855,112 @@ void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...) if (num >= 6) regs.r9 = va_arg(ap, uint64_t); - vcpu_regs_set(vm, vcpuid, &regs); + vcpu_regs_set(vcpu, &regs); va_end(ap); } -/* - * VM VCPU Dump - * - * Input Args: - * vm - Virtual Machine - * vcpuid - VCPU ID - * indent - Left margin indent amount - * - * Output Args: - * stream - Output FILE stream - * - * Return: None - * - * Dumps the current state of the VCPU specified by vcpuid, within the VM - * given by vm, to the FILE stream given by stream. - */ -void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +void vcpu_arch_dump(FILE *stream, struct kvm_vcpu *vcpu, uint8_t indent) { struct kvm_regs regs; struct kvm_sregs sregs; - fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid); + fprintf(stream, "%*svCPU ID: %u\n", indent, "", vcpu->id); fprintf(stream, "%*sregs:\n", indent + 2, ""); - vcpu_regs_get(vm, vcpuid, &regs); + vcpu_regs_get(vcpu, &regs); regs_dump(stream, &regs, indent + 4); fprintf(stream, "%*ssregs:\n", indent + 2, ""); - vcpu_sregs_get(vm, vcpuid, &sregs); + vcpu_sregs_get(vcpu, &sregs); sregs_dump(stream, &sregs, indent + 4); } -struct kvm_x86_state { - struct kvm_vcpu_events events; - struct kvm_mp_state mp_state; - struct kvm_regs regs; - struct kvm_xsave xsave; - struct kvm_xcrs xcrs; - struct kvm_sregs sregs; - struct kvm_debugregs debugregs; - union { - struct kvm_nested_state nested; - char nested_[16384]; - }; - struct kvm_msrs msrs; -}; - -static int kvm_get_num_msrs_fd(int kvm_fd) +static struct kvm_msr_list *__kvm_get_msr_index_list(bool feature_msrs) { + struct kvm_msr_list *list; struct kvm_msr_list nmsrs; - int r; + int kvm_fd, r; + + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs.nmsrs = 0; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); - TEST_ASSERT(r == -1 && errno == E2BIG,
"Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", - r); + if (!feature_msrs) + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs); + else + r = __kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, &nmsrs); + + TEST_ASSERT(r == -1 && errno == E2BIG, + "Expected -E2BIG, got rc: %i errno: %i (%s)", + r, errno, strerror(errno)); + + list = malloc(sizeof(*list) + nmsrs.nmsrs * sizeof(list->indices[0])); + TEST_ASSERT(list, "-ENOMEM when allocating MSR index list"); + list->nmsrs = nmsrs.nmsrs; + + if (!feature_msrs) + kvm_ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + else + kvm_ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + close(kvm_fd); - return nmsrs.nmsrs; + TEST_ASSERT(list->nmsrs == nmsrs.nmsrs, + "Number of MSRs in list changed, was %d, now %d", + nmsrs.nmsrs, list->nmsrs); + return list; } -static int kvm_get_num_msrs(struct kvm_vm *vm) +const struct kvm_msr_list *kvm_get_msr_index_list(void) { - return kvm_get_num_msrs_fd(vm->kvm_fd); + static const struct kvm_msr_list *list; + + if (!list) + list = __kvm_get_msr_index_list(false); + return list; } -struct kvm_msr_list *kvm_get_msr_index_list(void) + +const struct kvm_msr_list *kvm_get_feature_msr_index_list(void) { - struct kvm_msr_list *list; - int nmsrs, r, kvm_fd; + static const struct kvm_msr_list *list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + if (!list) + list = __kvm_get_msr_index_list(true); + return list; +} - nmsrs = kvm_get_num_msrs_fd(kvm_fd); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - close(kvm_fd); +bool kvm_msr_is_in_save_restore_list(uint32_t msr_index) +{ + const struct kvm_msr_list *list = kvm_get_msr_index_list(); + int i; - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); + for (i = 0; i < list->nmsrs; ++i) { + if (list->indices[i] == msr_index) + return true; + } - return list; + return false; } 
-struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) +static void vcpu_save_xsave_state(struct kvm_vcpu *vcpu, + struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - struct kvm_msr_list *list; + int size = vm_check_cap(vcpu->vm, KVM_CAP_XSAVE2); + + if (size) { + state->xsave = malloc(size); + vcpu_xsave2_get(vcpu, state->xsave); + } else { + state->xsave = malloc(sizeof(struct kvm_xsave)); + vcpu_xsave_get(vcpu, state->xsave); + } +} + +struct kvm_x86_state *vcpu_save_state(struct kvm_vcpu *vcpu) +{ + const struct kvm_msr_list *msr_list = kvm_get_msr_index_list(); struct kvm_x86_state *state; - int nmsrs, r, i; + int i; + static int nested_size = -1; if (nested_size == -1) { @@ -1083,143 +976,90 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) * kernel with KVM_RUN. Complete IO prior to migrating state * to a new VM. */ - vcpu_run_complete_io(vm, vcpuid); - - nmsrs = kvm_get_num_msrs(vm); - list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); - list->nmsrs = nmsrs; - r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); - - state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); - r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", - r); - - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i", - r); - } + 
vcpu_run_complete_io(vcpu); + + state = malloc(sizeof(*state) + msr_list->nmsrs * sizeof(state->msrs.entries[0])); + + vcpu_events_get(vcpu, &state->events); + vcpu_mp_state_get(vcpu, &state->mp_state); + vcpu_regs_get(vcpu, &state->regs); + vcpu_save_xsave_state(vcpu, state); - r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", - r); + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_get(vcpu, &state->xcrs); + + vcpu_sregs_get(vcpu, &state->sregs); if (nested_size) { state->nested.size = sizeof(state->nested_); - r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", - r); + + vcpu_nested_state_get(vcpu, &state->nested); TEST_ASSERT(state->nested.size <= nested_size, - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", - state->nested.size, nested_size); - } else + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); + } else { state->nested.size = 0; + } - state->msrs.nmsrs = nmsrs; - for (i = 0; i < nmsrs; i++) - state->msrs.entries[i].index = list->indices[i]; - r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == nmsrs ? 
-1 : list->indices[r]); + state->msrs.nmsrs = msr_list->nmsrs; + for (i = 0; i < msr_list->nmsrs; i++) + state->msrs.entries[i].index = msr_list->indices[i]; + vcpu_msrs_get(vcpu, &state->msrs); - r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", - r); + vcpu_debugregs_get(vcpu, &state->debugregs); - free(list); return state; } -void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state) +void vcpu_load_state(struct kvm_vcpu *vcpu, struct kvm_x86_state *state) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); - int r; - - r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", - r); - - if (kvm_check_cap(KVM_CAP_XCRS)) { - r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i", - r); - } - - r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); - TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)", - r, r == state->msrs.nmsrs ? 
-1 : state->msrs.entries[r].index); + vcpu_sregs_set(vcpu, &state->sregs); + vcpu_msrs_set(vcpu, &state->msrs); - r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", - r); + if (kvm_has_cap(KVM_CAP_XCRS)) + vcpu_xcrs_set(vcpu, &state->xcrs); - r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", - r); + vcpu_xsave_set(vcpu, state->xsave); + vcpu_events_set(vcpu, &state->events); + vcpu_mp_state_set(vcpu, &state->mp_state); + vcpu_debugregs_set(vcpu, &state->debugregs); + vcpu_regs_set(vcpu, &state->regs); - r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", - r); - - r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", - r); - - if (state->nested.size) { - r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", - r); - } + if (state->nested.size) + vcpu_nested_state_set(vcpu, &state->nested); } -bool is_intel_cpu(void) +void kvm_x86_state_cleanup(struct kvm_x86_state *state) { - int eax, ebx, ecx, edx; - const uint32_t *chunk; - const int leaf = 0; + free(state->xsave); + free(state); +} - __asm__ __volatile__( - "cpuid" - : /* output */ "=a"(eax), "=b"(ebx), - "=c"(ecx), "=d"(edx) - : /* input */ "0"(leaf), "2"(0)); +static bool cpu_vendor_string_is(const char *vendor) +{ + const uint32_t *chunk = (const uint32_t *)vendor; + uint32_t eax, ebx, ecx, edx; - chunk = (const uint32_t *)("GenuineIntel"); + cpuid(0, &eax, &ebx, &ecx, &edx); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); } -uint32_t kvm_get_cpuid_max_basic(void) +bool is_intel_cpu(void) { - return kvm_get_supported_cpuid_entry(0)->eax; + return cpu_vendor_string_is("GenuineIntel"); } -uint32_t 
kvm_get_cpuid_max_extended(void) +/* + * Exclude early K5 samples with a vendor string of "AMDisbetter!" + */ +bool is_amd_cpu(void) { - return kvm_get_supported_cpuid_entry(0x80000000)->eax; + return cpu_vendor_string_is("AuthenticAMD"); } void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) { - struct kvm_cpuid_entry2 *entry; + const struct kvm_cpuid_entry2 *entry; bool pae; /* SDM 4.1.4 */ @@ -1233,3 +1073,257 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) *va_bits = (entry->eax >> 8) & 0xff; } } + +struct idt_entry { + uint16_t offset0; + uint16_t selector; + uint16_t ist : 3; + uint16_t : 5; + uint16_t type : 4; + uint16_t : 1; + uint16_t dpl : 2; + uint16_t p : 1; + uint16_t offset1; + uint32_t offset2; uint32_t reserved; +}; + +static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr, + int dpl, unsigned short selector) +{ + struct idt_entry *base = + (struct idt_entry *)addr_gva2hva(vm, vm->idt); + struct idt_entry *e = &base[vector]; + + memset(e, 0, sizeof(*e)); + e->offset0 = addr; + e->selector = selector; + e->ist = 0; + e->type = 14; + e->dpl = dpl; + e->p = 1; + e->offset1 = addr >> 16; + e->offset2 = addr >> 32; +} + + +static bool kvm_fixup_exception(struct ex_regs *regs) +{ + if (regs->r9 != KVM_EXCEPTION_MAGIC || regs->rip != regs->r10) + return false; + + if (regs->vector == DE_VECTOR) + return false; + + regs->rip = regs->r11; + regs->r9 = regs->vector; + return true; +} + +void kvm_exit_unexpected_vector(uint32_t value) +{ + ucall(UCALL_UNHANDLED, 1, value); +} + +void route_exception(struct ex_regs *regs) +{ + typedef void(*handler)(struct ex_regs *); + handler *handlers = (handler *)exception_handlers; + + if (handlers && handlers[regs->vector]) { + handlers[regs->vector](regs); + return; + } + + if (kvm_fixup_exception(regs)) + return; + + kvm_exit_unexpected_vector(regs->vector); +} + +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + extern void 
*idt_handlers; + int i; + + vm->idt = vm_vaddr_alloc_page(vm); + vm->handlers = vm_vaddr_alloc_page(vm); + /* Handlers have the same address in both address spaces.*/ + for (i = 0; i < NUM_INTERRUPTS; i++) + set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0, + DEFAULT_CODE_SELECTOR); +} + +void vcpu_init_descriptor_tables(struct kvm_vcpu *vcpu) +{ + struct kvm_vm *vm = vcpu->vm; + struct kvm_sregs sregs; + + vcpu_sregs_get(vcpu, &sregs); + sregs.idt.base = vm->idt; + sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1; + sregs.gdt.base = vm->gdt; + sregs.gdt.limit = getpagesize() - 1; + kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs); + vcpu_sregs_set(vcpu, &sregs); + *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers; +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, + void (*handler)(struct ex_regs *)) +{ + vm_vaddr_t *handlers = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers); + + handlers[vector] = (vm_vaddr_t)handler; +} + +void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED) { + uint64_t vector = uc.args[0]; + + TEST_FAIL("Unexpected vectored event in guest (vector:0x%lx)", + vector); + } +} + +const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid, + uint32_t function, uint32_t index) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == function && + cpuid->entries[i].index == index) + return &cpuid->entries[i]; + } + + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); + + return NULL; +} + +uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, + uint64_t a3) +{ + uint64_t r; + + asm volatile("vmcall" + : "=a"(r) + : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3)); + return r; +} + +const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) +{ + static struct kvm_cpuid2 *cpuid; + int kvm_fd; + + if 
(cpuid) + return cpuid; + + cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + kvm_fd = open_kvm_dev_path_or_exit(); + + kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + close(kvm_fd); + return cpuid; +} + +void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu) +{ + static struct kvm_cpuid2 *cpuid_full; + const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv; + int i, nent = 0; + + if (!cpuid_full) { + cpuid_sys = kvm_get_supported_cpuid(); + cpuid_hv = kvm_get_supported_hv_cpuid(); + + cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent); + if (!cpuid_full) { + perror("malloc"); + abort(); + } + + /* Need to skip KVM CPUID leaves 0x400000xx */ + for (i = 0; i < cpuid_sys->nent; i++) { + if (cpuid_sys->entries[i].function >= 0x40000000 && + cpuid_sys->entries[i].function < 0x40000100) + continue; + cpuid_full->entries[nent] = cpuid_sys->entries[i]; + nent++; + } + + memcpy(&cpuid_full->entries[nent], cpuid_hv->entries, + cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2)); + cpuid_full->nent = nent + cpuid_hv->nent; + } + + vcpu_init_cpuid(vcpu, cpuid_full); +} + +const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES); + + vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid); + + return cpuid; +} + +unsigned long vm_compute_max_gfn(struct kvm_vm *vm) +{ + const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ + unsigned long ht_gfn, max_gfn, max_pfn; + uint32_t eax, ebx, ecx, edx, max_ext_leaf; + + max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; + + /* Avoid reserved HyperTransport region on AMD processors. */ + if (!is_amd_cpu()) + return max_gfn; + + /* On parts with <40 physical address bits, the area is fully hidden */ + if (vm->pa_bits < 40) + return max_gfn; + + /* Before family 17h, the HyperTransport area is just below 1T. 
*/ + ht_gfn = (1 << 28) - num_ht_pages; + cpuid(1, &eax, &ebx, &ecx, &edx); + if (x86_family(eax) < 0x17) + goto done; + + /* + * Otherwise it's at the top of the physical address space, possibly + * reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use + * the old conservative value if MAXPHYADDR is not enumerated. + */ + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + max_ext_leaf = eax; + if (max_ext_leaf < 0x80000008) + goto done; + + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1; + if (max_ext_leaf >= 0x8000001f) { + cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); + max_pfn >>= (ebx >> 6) & 0x3f; + } + + ht_gfn = max_pfn - num_ht_pages; +done: + return min(max_gfn, ht_gfn - 1); +} + +/* Returns true if kvm_intel was loaded with unrestricted_guest=1. */ +bool vm_is_unrestricted_guest(struct kvm_vm *vm) +{ + /* Ensure that a KVM vendor-specific module is loaded. */ + if (vm == NULL) + close(open_kvm_dev_path_or_exit()); + + return get_kvm_intel_param_bool("unrestricted_guest"); +} diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 6e05a8fc3fe0..5495a92dfd5a 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -9,10 +9,11 @@ #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "svm_util.h" +#define SEV_DEV_PATH "/dev/sev" + struct gpr64_regs guest_regs; u64 rflags; @@ -30,20 +31,22 @@ u64 rflags; struct svm_test_data * vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) { - vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vm_vaddr_t svm_gva = vm_vaddr_alloc_page(vm); struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); - svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->vmcb = (void *)vm_vaddr_alloc_page(vm); svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); 
svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); - svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + svm->save_area = (void *)vm_vaddr_alloc_page(vm); svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + svm->msr = (void *)vm_vaddr_alloc_page(vm); + svm->msr_hva = addr_gva2hva(vm, (uintptr_t)svm->msr); + svm->msr_gpa = addr_gva2gpa(vm, (uintptr_t)svm->msr); + memset(svm->msr_hva, 0, getpagesize()); + *p_svm_gva = svm_gva; return svm; } @@ -74,7 +77,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); memset(vmcb, 0, sizeof(*vmcb)); - asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory"); + asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory"); vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); @@ -95,6 +98,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL); + ctrl->msrpm_base_pa = svm->msr_gpa; vmcb->save.rip = (u64)guest_rip; vmcb->save.rsp = (u64)guest_rsp; @@ -131,31 +135,30 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) { asm volatile ( - "vmload\n\t" + "vmload %[vmcb_gpa]\n\t" "mov rflags, %%r15\n\t" // rflags "mov %%r15, 0x170(%[vmcb])\n\t" "mov guest_regs, %%r15\n\t" // rax "mov %%r15, 0x1f8(%[vmcb])\n\t" LOAD_GPR_C - "vmrun\n\t" + "vmrun %[vmcb_gpa]\n\t" SAVE_GPR_C "mov 0x170(%[vmcb]), %%r15\n\t" // rflags "mov %%r15, rflags\n\t" "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax "mov %%r15, guest_regs\n\t" - "vmsave\n\t" + "vmsave %[vmcb_gpa]\n\t" : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) : "r15", 
"memory"); } -void nested_svm_check_supported(void) +/* + * Open SEV_DEV_PATH if available, otherwise exit the entire program. + * + * Return: + * The opened file descriptor of /dev/sev. + */ +int open_sev_dev_path_or_exit(void) { - struct kvm_cpuid_entry2 *entry = - kvm_get_supported_cpuid_entry(0x80000001); - - if (!(entry->ecx & CPUID_SVM)) { - fprintf(stderr, "nested SVM not enabled, skipping test\n"); - exit(KSFT_SKIP); - } + return open_path_or_exit(SEV_DEV_PATH, 0); } - diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c index da4d89ad5419..e5f0f9e0d3ee 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/ucall.c +++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c @@ -24,7 +24,7 @@ void ucall(uint64_t cmd, int nargs, ...) va_list va; int i; - nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; + nargs = min(nargs, UCALL_MAX_ARGS); va_start(va, nargs); for (i = 0; i < nargs; ++i) @@ -35,19 +35,22 @@ void ucall(uint64_t cmd, int nargs, ...) 
: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory"); } -uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) +uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc) { - struct kvm_run *run = vcpu_state(vm, vcpu_id); + struct kvm_run *run = vcpu->run; struct ucall ucall = {}; + if (uc) + memset(uc, 0, sizeof(*uc)); + if (run->exit_reason == KVM_EXIT_IO && run->io.port == UCALL_PIO_PORT) { struct kvm_regs regs; - vcpu_regs_get(vm, vcpu_id, &regs); - memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi), + vcpu_regs_get(vcpu, &regs); + memcpy(&ucall, addr_gva2hva(vcpu->vm, (vm_vaddr_t)regs.rdi), sizeof(ucall)); - vcpu_run_complete_io(vm, vcpu_id); + vcpu_run_complete_io(vcpu); if (uc) memcpy(uc, &ucall, sizeof(ucall)); } diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 7aaa99ca4dbc..d21049c38fc5 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -5,9 +5,10 @@ * Copyright (C) 2018, Google LLC.
*/ +#include <asm/msr-index.h> + #include "test_util.h" #include "kvm_util.h" -#include "../kvm_util_internal.h" #include "processor.h" #include "vmx.h" @@ -17,6 +18,9 @@ bool enable_evmcs; +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + struct eptPageTableEntry { uint64_t readable:1; uint64_t writable:1; @@ -40,16 +44,12 @@ struct eptPageTablePointer { uint64_t address:40; uint64_t reserved_63_52:12; }; -int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) +int vcpu_enable_evmcs(struct kvm_vcpu *vcpu) { uint16_t evmcs_ver; - struct kvm_enable_cap enable_evmcs_cap = { - .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS, - .args[0] = (unsigned long)&evmcs_ver - }; - - vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap); + vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, + (unsigned long)&evmcs_ver); /* KVM should return supported EVMCS version range */ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) && @@ -74,50 +74,48 @@ int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id) struct vmx_pages * vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva) { - vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vm_vaddr_t vmx_gva = vm_vaddr_alloc_page(vm); struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva); /* Setup of a region of guest memory for the vmxon region. */ - vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmxon = (void *)vm_vaddr_alloc_page(vm); vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon); vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon); /* Setup of a region of guest memory for a vmcs. */ - vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs); vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs); /* Setup of a region of guest memory for the MSR bitmap. 
*/ - vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->msr = (void *)vm_vaddr_alloc_page(vm); vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr); vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr); memset(vmx->msr_hva, 0, getpagesize()); /* Setup of a region of guest memory for the shadow VMCS. */ - vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->shadow_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs); vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs); /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */ - vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmread = (void *)vm_vaddr_alloc_page(vm); vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread); vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread); memset(vmx->vmread_hva, 0, getpagesize()); - vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + vmx->vmwrite = (void *)vm_vaddr_alloc_page(vm); vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite); vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite); memset(vmx->vmwrite_hva, 0, getpagesize()); /* Setup of a region of guest memory for the VP Assist page. */ - vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->vp_assist = (void *)vm_vaddr_alloc_page(vm); vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist); vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist); /* Setup of a region of guest memory for the enlightened VMCS. 
*/ - vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), - 0x10000, 0, 0); + vmx->enlightened_vmcs = (void *)vm_vaddr_alloc_page(vm); vmx->enlightened_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs); vmx->enlightened_vmcs_gpa = @@ -191,12 +189,22 @@ bool load_vmcs(struct vmx_pages *vmx) if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa, vmx->enlightened_vmcs)) return false; - current_evmcs->revision_id = vmcs_revision(); + current_evmcs->revision_id = EVMCS_VERSION; } return true; } +static bool ept_vpid_cap_supported(uint64_t mask) +{ + return rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & mask; +} + +bool ept_1g_pages_supported(void) +{ + return ept_vpid_cap_supported(VMX_EPT_VPID_CAP_1G_PAGES); +} + /* * Initialize the control fields to the most basic settings possible. */ @@ -214,7 +222,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx) struct eptPageTablePointer eptp = { .memory_type = VMX_BASIC_MEM_TYPE_WB, .page_walk_length = 3, /* + 1 */ - .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS), + .ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS), .address = vmx->eptp_gpa >> PAGE_SHIFT_4K, }; @@ -376,96 +384,93 @@ void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp) init_vmcs_guest_state(guest_rip, guest_rsp); } -void nested_vmx_check_supported(void) +static void nested_create_pte(struct kvm_vm *vm, + struct eptPageTableEntry *pte, + uint64_t nested_paddr, + uint64_t paddr, + int current_level, + int target_level) { - struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1); - - if (!(entry->ecx & CPUID_VMX)) { - fprintf(stderr, "nested VMX not enabled, skipping test\n"); - exit(KSFT_SKIP); + if (!pte->readable) { + pte->writable = true; + pte->readable = true; + pte->executable = true; + pte->page_size = (current_level == target_level); + if (pte->page_size) + pte->address = paddr >> vm->page_shift; + else + pte->address = vm_alloc_page_table(vm) >> 
vm->page_shift; + } else { + /* + * Entry already present. Assert that the caller doesn't want + * a hugepage at this level, and that there isn't a hugepage at + * this level. + */ + TEST_ASSERT(current_level != target_level, + "Cannot create hugepage at level: %u, nested_paddr: 0x%lx\n", + current_level, nested_paddr); + TEST_ASSERT(!pte->page_size, + "Cannot create page table at level: %u, nested_paddr: 0x%lx\n", + current_level, nested_paddr); } } -void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot) + +void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, int target_level) { - uint16_t index[4]; - struct eptPageTableEntry *pml4e; + const uint64_t page_size = PG_LEVEL_SIZE(target_level); + struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; + uint16_t index; TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " "unknown or unsupported guest mode, mode: 0x%x", vm->mode); - TEST_ASSERT((nested_paddr % vm->page_size) == 0, + TEST_ASSERT((nested_paddr >> 48) == 0, + "Nested physical address 0x%lx requires 5-level paging", + nested_paddr); + TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" - " nested_paddr: 0x%lx vm->page_size: 0x%x", - nested_paddr, vm->page_size); + " nested_paddr: 0x%lx page_size: 0x%lx", + nested_paddr, page_size); TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" " nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - TEST_ASSERT((paddr % vm->page_size) == 0, + TEST_ASSERT((paddr % page_size) == 0, "Physical address not on page boundary,\n" - " paddr: 0x%lx vm->page_size: 0x%x", - paddr, vm->page_size); + " paddr: 0x%lx page_size: 0x%lx", + paddr, page_size); TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn, "Physical address beyond beyond maximum supported,\n" 
" paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", paddr, vm->max_gfn, vm->page_size); - index[0] = (nested_paddr >> 12) & 0x1ffu; - index[1] = (nested_paddr >> 21) & 0x1ffu; - index[2] = (nested_paddr >> 30) & 0x1ffu; - index[3] = (nested_paddr >> 39) & 0x1ffu; - - /* Allocate page directory pointer table if not present. */ - pml4e = vmx->eptp_hva; - if (!pml4e[index[3]].readable) { - pml4e[index[3]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pml4e[index[3]].writable = true; - pml4e[index[3]].readable = true; - pml4e[index[3]].executable = true; - } + for (int level = PG_LEVEL_512G; level >= PG_LEVEL_4K; level--) { + index = (nested_paddr >> PG_LEVEL_SHIFT(level)) & 0x1ffu; + pte = &pt[index]; - /* Allocate page directory table if not present. */ - struct eptPageTableEntry *pdpe; - pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size); - if (!pdpe[index[2]].readable) { - pdpe[index[2]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pdpe[index[2]].writable = true; - pdpe[index[2]].readable = true; - pdpe[index[2]].executable = true; - } + nested_create_pte(vm, pte, nested_paddr, paddr, level, target_level); - /* Allocate page table if not present. */ - struct eptPageTableEntry *pde; - pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size); - if (!pde[index[1]].readable) { - pde[index[1]].address = vm_phy_page_alloc(vm, - KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot) - >> vm->page_shift; - pde[index[1]].writable = true; - pde[index[1]].readable = true; - pde[index[1]].executable = true; - } + if (pte->page_size) + break; - /* Fill in page table entry. 
*/ - struct eptPageTableEntry *pte; - pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size); - pte[index[0]].address = paddr >> vm->page_shift; - pte[index[0]].writable = true; - pte[index[0]].readable = true; - pte[index[0]].executable = true; + pt = addr_gpa2hva(vm, pte->address * vm->page_size); + } /* * For now mark these as accessed and dirty because the only * testcase we have needs that. Can be reconsidered later. */ - pte[index[0]].accessed = true; - pte[index[0]].dirty = true; + pte->accessed = true; + pte->dirty = true; + +} + +void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr) +{ + __nested_pg_map(vmx, vm, nested_paddr, paddr, PG_LEVEL_4K); } /* @@ -476,7 +481,7 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * nested_paddr - Nested guest physical address to map * paddr - VM Physical Address * size - The size of the range to map - * eptp_memslot - Memory region slot for new virtual translation tables + * level - The level at which to map the range * * Output Args: None * @@ -485,28 +490,34 @@ void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, * Within the VM given by vm, creates a nested guest translation for the * page range starting at nested_paddr to the page range starting at paddr. 
*/ -void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, - uint64_t nested_paddr, uint64_t paddr, uint64_t size, - uint32_t eptp_memslot) +void __nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size, + int level) { - size_t page_size = vm->page_size; + size_t page_size = PG_LEVEL_SIZE(level); size_t npages = size / page_size; TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow"); TEST_ASSERT(paddr + size > paddr, "Paddr overflow"); while (npages--) { - nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot); + __nested_pg_map(vmx, vm, nested_paddr, paddr, level); nested_paddr += page_size; paddr += page_size; } } +void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t nested_paddr, uint64_t paddr, uint64_t size) +{ + __nested_map(vmx, vm, nested_paddr, paddr, size, PG_LEVEL_4K); +} + /* Prepare an identity extended page table that maps all the * physical pages in VM. */ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t memslot, uint32_t eptp_memslot) + uint32_t memslot) { sparsebit_idx_t i, last; struct userspace_mem_region *region = @@ -522,15 +533,46 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, nested_map(vmx, vm, (uint64_t)i << vm->page_shift, (uint64_t)i << vm->page_shift, - 1 << vm->page_shift, - eptp_memslot); + 1 << vm->page_shift); } } +/* Identity map a region with 1GiB Pages. 
*/ +void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, + uint64_t addr, uint64_t size) +{ + __nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G); +} + +bool kvm_vm_has_ept(struct kvm_vm *vm) +{ + struct kvm_vcpu *vcpu; + uint64_t ctrl; + + vcpu = list_first_entry(&vm->vcpus, struct kvm_vcpu, list); + TEST_ASSERT(vcpu, "Cannot determine EPT support without vCPUs.\n"); + + ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32; + if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) + return false; + + ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2) >> 32; + return ctrl & SECONDARY_EXEC_ENABLE_EPT; +} + void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, uint32_t eptp_memslot) { - vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + TEST_REQUIRE(kvm_vm_has_ept(vm)); + + vmx->eptp = (void *)vm_vaddr_alloc_page(vm); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); } + +void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm) +{ + vmx->apic_access = (void *)vm_vaddr_alloc_page(vm); + vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access); + vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access); +} |