-rw-r--r--  sys/arch/i386/conf/GENERIC           |    3
-rw-r--r--  sys/arch/i386/conf/Makefile.i386     |    4
-rw-r--r--  sys/arch/i386/conf/files.i386        |   10
-rw-r--r--  sys/arch/i386/i386/conf.c            |   14
-rw-r--r--  sys/arch/i386/i386/cpu.c             |   34
-rw-r--r--  sys/arch/i386/i386/ipifuncs.c        |   30
-rw-r--r--  sys/arch/i386/i386/machdep.c         |  114
-rw-r--r--  sys/arch/i386/i386/mainbus.c         |    8
-rw-r--r--  sys/arch/i386/i386/pmap.c            |   24
-rw-r--r--  sys/arch/i386/i386/pmapae.c          |   64
-rw-r--r--  sys/arch/i386/i386/vmm.c             | 5433
-rw-r--r--  sys/arch/i386/i386/vmm_support.S     |  291
-rw-r--r--  sys/arch/i386/include/cpu.h          |   41
-rw-r--r--  sys/arch/i386/include/intrdefs.h     |    9
-rw-r--r--  sys/arch/i386/include/pmap.h         |   12
-rw-r--r--  sys/arch/i386/include/pte.h          |    9
-rw-r--r--  sys/arch/i386/include/specialreg.h   |  367
-rw-r--r--  sys/arch/i386/include/vmmvar.h       |  446
18 files changed, 6887 insertions(+), 26 deletions(-)
diff --git a/sys/arch/i386/conf/GENERIC b/sys/arch/i386/conf/GENERIC
index 808917af519..9e3d2a265be 100644
--- a/sys/arch/i386/conf/GENERIC
+++ b/sys/arch/i386/conf/GENERIC
@@ -1,4 +1,4 @@
-# $OpenBSD: GENERIC,v 1.823 2016/09/12 08:28:44 mpi Exp $
+# $OpenBSD: GENERIC,v 1.824 2016/10/21 06:20:58 mlarkin Exp $
#
# For further information on compiling OpenBSD kernels, see the config(8)
# man page.
@@ -79,6 +79,7 @@ isa0 at gscpcib?
isa0 at glxpcib?
eisa0 at mainbus0
pci* at mainbus0
+vmm0 at mainbus0
pchb* at pci? # PCI-Host bridges
ppb* at pci? # PCI-PCI bridges
diff --git a/sys/arch/i386/conf/Makefile.i386 b/sys/arch/i386/conf/Makefile.i386
index 18f05560470..13b1b7cf8a2 100644
--- a/sys/arch/i386/conf/Makefile.i386
+++ b/sys/arch/i386/conf/Makefile.i386
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.i386,v 1.97 2016/10/15 13:45:08 deraadt Exp $
+# $OpenBSD: Makefile.i386,v 1.98 2016/10/21 06:20:58 mlarkin Exp $
# For instructions on building kernels consult the config(8) and options(4)
# manual pages.
@@ -149,7 +149,7 @@ db_structinfo.h: $S/ddb/db_structinfo.c $S/ddb/parse_structinfo.pl
rm -f db_structinfo.o
locore.o: ${_machdir}/${_mach}/locore.s assym.h
-in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o: assym.h
+in_cksum.o mptramp.o kvm86call.o acpi_wakecode.o vmm_support.o: assym.h
# The install target can be redefined by putting a
# install-kernel-${MACHINE_NAME} target into /etc/mk.conf
diff --git a/sys/arch/i386/conf/files.i386 b/sys/arch/i386/conf/files.i386
index 7f1ef1eb725..efb759667b0 100644
--- a/sys/arch/i386/conf/files.i386
+++ b/sys/arch/i386/conf/files.i386
@@ -1,4 +1,4 @@
-# $OpenBSD: files.i386,v 1.229 2016/02/28 15:46:18 naddy Exp $
+# $OpenBSD: files.i386,v 1.230 2016/10/21 06:20:58 mlarkin Exp $
#
# new style config file for i386 architecture
#
@@ -389,6 +389,14 @@ file arch/i386/i386/acpi_machdep.c acpi
file arch/i386/i386/acpi_wakecode.S acpi & !small_kernel
#
+# VMM
+#
+device vmm {}
+attach vmm at mainbus
+file arch/i386/i386/vmm.c vmm needs-flag
+file arch/i386/i386/vmm_support.S vmm
+
+#
# IPMI
#
attach ipmi at mainbus
diff --git a/sys/arch/i386/i386/conf.c b/sys/arch/i386/i386/conf.c
index 812d82d8550..1622e6a90eb 100644
--- a/sys/arch/i386/i386/conf.c
+++ b/sys/arch/i386/i386/conf.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: conf.c,v 1.157 2016/09/04 10:51:23 naddy Exp $ */
+/* $OpenBSD: conf.c,v 1.158 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: conf.c,v 1.75 1996/05/03 19:40:20 christos Exp $ */
/*
@@ -105,6 +105,14 @@ int nblkdev = nitems(bdevsw);
(dev_type_stop((*))) enodev, 0, seltrue, \
(dev_type_mmap((*))) enodev, 0 }
+/* open, close, ioctl */
+#define cdev_vmm_init(c,n) { \
+ dev_init(c,n,open), dev_init(c,n,close), \
+ (dev_type_read((*))) enodev, \
+ (dev_type_write((*))) enodev, \
+ dev_init(c,n,ioctl), \
+ (dev_type_stop((*))) enodev, 0, seltrue, \
+ (dev_type_mmap((*))) enodev }
#define mmread mmrw
#define mmwrite mmrw
@@ -178,6 +186,8 @@ cdev_decl(pci);
#include "pvbus.h"
#include "ipmi.h"
#include "switch.h"
+#include "vmm.h"
+cdev_decl(vmm);
struct cdevsw cdevsw[] =
{
@@ -191,7 +201,7 @@ struct cdevsw cdevsw[] =
cdev_log_init(1,log), /* 7: /dev/klog */
cdev_tty_init(NCOM,com), /* 8: serial port */
cdev_disk_init(NFD,fd), /* 9: floppy disk */
- cdev_notdef(), /* 10 */
+ cdev_vmm_init(NVMM,vmm), /* 10: vmm */
cdev_notdef(), /* 11 */
cdev_wsdisplay_init(NWSDISPLAY, /* 12: frame buffers, etc. */
wsdisplay),
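
The conf.c hunk above wires /dev/vmm (character major 10) with only open, close and ioctl entry points. A minimal userland sketch of how such a node is driven, assuming the VMM_IOC_CREATE ioctl and struct vm_create_params introduced later in this patch; the header path, O_RDWR mode and error handling are illustrative only:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmmvar.h>	/* assumed install path of the new header */
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct vm_create_params vcp = { 0 };	/* caller must fill in memranges, ncpus (1), name */
	int fd;

	fd = open("/dev/vmm", O_RDWR);		/* dispatched to vmmopen() */
	if (fd == -1)
		return (1);
	/* VMM_IOC_CREATE is routed by vmmioctl() to vm_create() */
	if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1) {
		close(fd);
		return (1);
	}
	close(fd);				/* vmmclose() */
	return (0);
}
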
diff --git a/sys/arch/i386/i386/cpu.c b/sys/arch/i386/i386/cpu.c
index babc4f56b76..3ce489a5531 100644
--- a/sys/arch/i386/i386/cpu.c
+++ b/sys/arch/i386/i386/cpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.c,v 1.79 2016/07/28 21:57:56 kettenis Exp $ */
+/* $OpenBSD: cpu.c,v 1.80 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: cpu.c,v 1.1.2.7 2000/06/26 02:04:05 sommerfeld Exp $ */
/*-
@@ -66,6 +66,7 @@
#include "lapic.h"
#include "ioapic.h"
+#include "vmm.h"
#include <sys/param.h>
#include <sys/timeout.h>
@@ -113,6 +114,9 @@ int cpu_activate(struct device *, int);
void patinit(struct cpu_info *ci);
void cpu_idle_mwait_cycle(void);
void cpu_init_mwait(struct device *);
+#if NVMM > 0
+void cpu_init_vmm(struct cpu_info *ci);
+#endif /* NVMM > 0 */
u_int cpu_mwait_size, cpu_mwait_states;
@@ -345,6 +349,10 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
ci->ci_dev.dv_xname, pcb, pcb->pcb_esp);
}
#endif
+
+#if NVMM > 0
+ cpu_init_vmm(ci);
+#endif /* NVMM > 0 */
}
/*
@@ -407,6 +415,23 @@ cpu_init(struct cpu_info *ci)
}
void
+cpu_init_vmm(struct cpu_info *ci)
+{
+ /*
+ * Allocate a per-cpu VMXON region
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ ci->ci_vmxon_region_pa = 0;
+ ci->ci_vmxon_region = (struct vmxon_region *)malloc(PAGE_SIZE,
+ M_DEVBUF, M_WAITOK|M_ZERO);
+ if (!pmap_extract(pmap_kernel(), (vaddr_t)ci->ci_vmxon_region,
+ (paddr_t *)&ci->ci_vmxon_region_pa))
+ panic("Can't locate VMXON region in phys mem\n");
+ }
+}
+
+
+void
patinit(struct cpu_info *ci)
{
extern int pmap_pg_wc;
@@ -415,13 +440,6 @@ patinit(struct cpu_info *ci)
if ((ci->ci_feature_flags & CPUID_PAT) == 0)
return;
-#define PATENTRY(n, type) ((u_int64_t)type << ((n) * 8))
-#define PAT_UC 0x0UL
-#define PAT_WC 0x1UL
-#define PAT_WT 0x4UL
-#define PAT_WP 0x5UL
-#define PAT_WB 0x6UL
-#define PAT_UCMINUS 0x7UL
/*
* Set up PAT bits.
* The default pat table is the following:
diff --git a/sys/arch/i386/i386/ipifuncs.c b/sys/arch/i386/i386/ipifuncs.c
index b313879b852..e1b820fd77c 100644
--- a/sys/arch/i386/i386/ipifuncs.c
+++ b/sys/arch/i386/i386/ipifuncs.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ipifuncs.c,v 1.27 2015/07/19 18:53:49 sf Exp $ */
+/* $OpenBSD: ipifuncs.c,v 1.28 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: ipifuncs.c,v 1.1.2.3 2000/06/26 02:04:06 sommerfeld Exp $ */
/*-
@@ -37,6 +37,7 @@
*/
#include "npx.h"
+#include "vmm.h"
#include <sys/param.h>
#include <sys/device.h>
@@ -70,6 +71,11 @@ void i386_ipi_reload_mtrr(struct cpu_info *);
#define i386_ipi_reload_mtrr 0
#endif
+#if NVMM > 0
+void i386_ipi_start_vmm(struct cpu_info *);
+void i386_ipi_stop_vmm(struct cpu_info *);
+#endif /* NVMM > 0 */
+
void (*ipifunc[I386_NIPI])(struct cpu_info *) =
{
i386_ipi_halt,
@@ -88,6 +94,13 @@ void (*ipifunc[I386_NIPI])(struct cpu_info *) =
NULL,
#endif
i386_setperf_ipi,
+#if NVMM > 0
+ i386_ipi_start_vmm,
+ i386_ipi_stop_vmm,
+#else
+ NULL,
+ NULL,
+#endif /* NVMM > 0 */
};
void
@@ -208,3 +221,18 @@ i386_ipi_handler(void)
}
}
}
+
+#if NVMM > 0
+void
+i386_ipi_start_vmm(struct cpu_info *ci)
+{
+ start_vmm_on_cpu(ci);
+}
+
+void
+i386_ipi_stop_vmm(struct cpu_info *ci)
+{
+ stop_vmm_on_cpu(ci);
+}
+#endif /* NVMM > 0 */
+
diff --git a/sys/arch/i386/i386/machdep.c b/sys/arch/i386/i386/machdep.c
index d6af51e1d80..d2ca55c98d8 100644
--- a/sys/arch/i386/i386/machdep.c
+++ b/sys/arch/i386/i386/machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: machdep.c,v 1.592 2016/10/14 04:53:26 mlarkin Exp $ */
+/* $OpenBSD: machdep.c,v 1.593 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: machdep.c,v 1.214 1996/11/10 03:16:17 thorpej Exp $ */
/*-
@@ -168,6 +168,7 @@ extern struct proc *npxproc;
#include <machine/hibernate_var.h>
#endif /* HIBERNATE */
+#include "vmm.h"
void replacesmap(void);
int intr_handler(struct intrframe *, struct intrhand *);
@@ -339,6 +340,9 @@ void p3_get_bus_clock(struct cpu_info *);
void p4_update_cpuspeed(void);
void p3_update_cpuspeed(void);
int pentium_cpuspeed(int *);
+#if NVMM > 0
+void cpu_check_vmm_cap(struct cpu_info *);
+#endif /* NVMM > 0 */
static __inline u_char
cyrix_read_reg(u_char reg)
@@ -2077,6 +2081,10 @@ identifycpu(struct cpu_info *ci)
} else
i386_use_fxsave = 0;
+#if NVMM > 0
+ cpu_check_vmm_cap(ci);
+#endif /* NVMM > 0 */
+
}
char *
@@ -3967,3 +3975,107 @@ intr_barrier(void *ih)
{
sched_barrier(NULL);
}
+
+#if NVMM > 0
+/*
+ * cpu_check_vmm_cap
+ *
+ * Checks for VMM capabilities for 'ci'. Initializes certain per-cpu VMM
+ * state in 'ci' if virtualization extensions are found.
+ *
+ * Parameters:
+ * ci: the cpu being checked
+ */
+void
+cpu_check_vmm_cap(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cap, dummy;
+
+ /*
+ * Check for workable VMX
+ */
+ if (cpu_ecxfeature & CPUIDECX_VMX) {
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+
+ if (!(msr & IA32_FEATURE_CONTROL_LOCK))
+ ci->ci_vmm_flags |= CI_VMM_VMX;
+ else {
+ if (msr & IA32_FEATURE_CONTROL_VMX_EN)
+ ci->ci_vmm_flags |= CI_VMM_VMX;
+ }
+ }
+
+ /*
+ * Check for EPT (Intel Nested Paging) and other secondary
+ * controls
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ /* Secondary controls available? */
+ /* XXX should we check true procbased ctls here if avail? */
+ msr = rdmsr(IA32_VMX_PROCBASED_CTLS);
+ if (msr & (IA32_VMX_ACTIVATE_SECONDARY_CONTROLS) << 32) {
+ msr = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+ /* EPT available? */
+ if (msr & (IA32_VMX_ENABLE_EPT) << 32)
+ ci->ci_vmm_flags |= CI_VMM_EPT;
+ /* VM Functions available? */
+ if (msr & (IA32_VMX_ENABLE_VM_FUNCTIONS) << 32) {
+ ci->ci_vmm_cap.vcc_vmx.vmx_vm_func =
+ rdmsr(IA32_VMX_VMFUNC);
+ }
+ }
+ }
+
+ /*
+ * Check startup config (VMX)
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ /* CR0 fixed and flexible bits */
+ msr = rdmsr(IA32_VMX_CR0_FIXED0);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0 = msr;
+ msr = rdmsr(IA32_VMX_CR0_FIXED1);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1 = msr;
+
+ /* CR4 fixed and flexible bits */
+ msr = rdmsr(IA32_VMX_CR4_FIXED0);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0 = msr;
+ msr = rdmsr(IA32_VMX_CR4_FIXED1);
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1 = msr;
+
+ /* VMXON region revision ID (bits 30:0 of IA32_VMX_BASIC) */
+ msr = rdmsr(IA32_VMX_BASIC);
+ ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision =
+ (uint32_t)(msr & 0x7FFFFFFF);
+
+ /* MSR save / load table size */
+ msr = rdmsr(IA32_VMX_MISC);
+ ci->ci_vmm_cap.vcc_vmx.vmx_msr_table_size =
+ (uint32_t)(msr & IA32_VMX_MSR_LIST_SIZE_MASK) >> 25;
+
+ /* CR3 target count size */
+ ci->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count =
+ (uint32_t)(msr & IA32_VMX_CR3_TGT_SIZE_MASK) >> 16;
+ }
+
+ /*
+ * Check for workable SVM
+ */
+ if (ecpu_ecxfeature & CPUIDECX_SVM) {
+ msr = rdmsr(MSR_AMD_VM_CR);
+
+ if (!(msr & AMD_SVMDIS))
+ ci->ci_vmm_flags |= CI_VMM_SVM;
+ }
+
+ /*
+ * Check for SVM Nested Paging
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ CPUID(CPUID_AMD_SVM_CAP, dummy, dummy, dummy, cap);
+ if (cap & AMD_SVM_NESTED_PAGING_CAP)
+ ci->ci_vmm_flags |= CI_VMM_RVI;
+ }
+}
+#endif /* NVMM > 0 */
+
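
cpu_check_vmm_cap() above tests bits in the high dword of the IA32_VMX_*_CTLS MSRs because those capability MSRs report the allowed-0 settings of each control in bits 31:0 and the allowed-1 settings in bits 63:32. A small sketch of that interpretation, with helper names that are illustrative only (the in-tree equivalent is vcpu_vmx_check_cap() in vmm.c below):

#include <stdint.h>

/*
 * Illustrative helpers: interpret an IA32_VMX_*_CTLS capability MSR.
 * 'bit' is the single control bit of interest (e.g. the "activate
 * secondary controls" bit of the procbased controls).
 */

/* The control may be set to 1 iff its allowed-1 bit (high dword) is set. */
static inline int
vmx_ctl_can_be_set(uint64_t ctls_msr, uint32_t bit)
{
	return ((ctls_msr & ((uint64_t)bit << 32)) != 0);
}

/* The control may be cleared to 0 iff its allowed-0 bit (low dword) is clear. */
static inline int
vmx_ctl_can_be_cleared(uint64_t ctls_msr, uint32_t bit)
{
	return ((ctls_msr & bit) == 0);
}
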
diff --git a/sys/arch/i386/i386/mainbus.c b/sys/arch/i386/i386/mainbus.c
index d44a0f1c695..56acb1f57d6 100644
--- a/sys/arch/i386/i386/mainbus.c
+++ b/sys/arch/i386/i386/mainbus.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mainbus.c,v 1.55 2016/07/28 21:57:56 kettenis Exp $ */
+/* $OpenBSD: mainbus.c,v 1.56 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: mainbus.c,v 1.21 1997/06/06 23:14:20 thorpej Exp $ */
/*
@@ -54,6 +54,7 @@
#include "ipmi.h"
#include "esm.h"
#include "amdmsr.h"
+#include "vmm.h"
#include "pvbus.h"
#include <machine/cpuvar.h>
@@ -269,6 +270,11 @@ mainbus_attach(struct device *parent, struct device *self, void *aux)
#endif
config_found(self, &mba.mba_iba, mainbus_print);
}
+
+#if NVMM > 0
+ mba.mba_busname = "vmm";
+ config_found(self, &mba.mba_busname, mainbus_print);
+#endif /* NVMM > 0 */
}
int
diff --git a/sys/arch/i386/i386/pmap.c b/sys/arch/i386/i386/pmap.c
index 81337e8f24b..04248baa30d 100644
--- a/sys/arch/i386/i386/pmap.c
+++ b/sys/arch/i386/i386/pmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.c,v 1.194 2016/09/17 07:37:57 mlarkin Exp $ */
+/* $OpenBSD: pmap.c,v 1.195 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */
/*
@@ -74,6 +74,8 @@
#include <sys/msgbuf.h>
#include <stand/boot/bootarg.h>
+#include "vmm.h"
+
/*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
@@ -931,6 +933,11 @@ pmap_bootstrap(vaddr_t kva_start)
kpm->pm_pdirpa = proc0.p_addr->u_pcb.pcb_cr3;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
atop(kva_start - VM_MIN_KERNEL_ADDRESS);
+ kpm->pm_type = PMAP_TYPE_NORMAL;
+#if NVMM > 0
+ kpm->pm_npt_pml4 = 0;
+ kpm->pm_npt_pdpt = 0;
+#endif /* NVMM > 0 */
/*
* the above is just a rough estimate and not critical to the proper
@@ -1289,6 +1296,12 @@ pmap_create(void)
setsegment(&pmap->pm_codeseg, 0, atop(I386_MAX_EXE_ADDR) - 1,
SDT_MEMERA, SEL_UPL, 1, 1);
+ pmap->pm_type = PMAP_TYPE_NORMAL;
+#if NVMM > 0
+ pmap->pm_npt_pml4 = 0;
+ pmap->pm_npt_pdpt = 0;
+#endif /* NVMM > 0 */
+
pmap_pinit_pd(pmap);
return (pmap);
}
@@ -1356,6 +1369,15 @@ pmap_destroy(struct pmap *pmap)
uvm_km_free(kernel_map, pmap->pm_pdir, pmap->pm_pdirsize);
pmap->pm_pdir = 0;
+#if NVMM > 0
+ if (pmap->pm_npt_pml4)
+ km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any,
+ &kp_zero);
+ if (pmap->pm_npt_pdpt)
+ km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any,
+ &kp_zero);
+#endif /* NVMM > 0 */
+
pool_put(&pmap_pmap_pool, pmap);
}
diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c
index 46b366b0360..e4ffa837c9d 100644
--- a/sys/arch/i386/i386/pmapae.c
+++ b/sys/arch/i386/i386/pmapae.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmapae.c,v 1.51 2016/09/17 07:37:57 mlarkin Exp $ */
+/* $OpenBSD: pmapae.c,v 1.52 2016/10/21 06:20:58 mlarkin Exp $ */
/*
* Copyright (c) 2006-2008 Michael Shalayeff
@@ -1915,3 +1915,65 @@ pmap_flush_page_pae(paddr_t pa)
*pte = 0;
pmap_update_pg(va);
}
+
+int
+pmap_convert(struct pmap *pmap, int mode)
+{
+ int ret;
+ pt_entry_t *pte;
+ paddr_t pml4_pa, pdpt_pa;
+
+ pmap->pm_type = mode;
+
+ ret = 0;
+ if (mode == PMAP_TYPE_EPT) {
+ pmap->pm_npt_pml4 = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
+ &kp_zero, &kd_nowait);
+ if (!pmap->pm_npt_pml4) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ pmap->pm_npt_pdpt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
+ &kp_zero, &kd_nowait);
+ if (!pmap->pm_npt_pdpt) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pml4,
+ &pml4_pa)) {
+ ret = ENOMEM;
+ goto error;
+ }
+ pmap->pm_npt_pa = pml4_pa;
+
+ if (!pmap_extract(pmap_kernel(), pmap->pm_npt_pdpt,
+ &pdpt_pa)) {
+ ret = ENOMEM;
+ goto error;
+ }
+
+ pte = (pt_entry_t *)pmap->pm_npt_pml4;
+ pte[0] = (pdpt_pa & PG_FRAME) | EPT_R | EPT_W | EPT_X;
+ pte = (pt_entry_t *)pmap->pm_npt_pdpt;
+ pte[0] = (pmap->pm_pdidx[0] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[1] = (pmap->pm_pdidx[1] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[2] = (pmap->pm_pdidx[2] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ pte[3] = (pmap->pm_pdidx[3] & PG_FRAME) |
+ EPT_R | EPT_W | EPT_X;
+ }
+
+ return (ret);
+
+error:
+ if (pmap->pm_npt_pml4)
+ km_free((void *)pmap->pm_npt_pml4, PAGE_SIZE, &kv_any, &kp_zero);
+ if (pmap->pm_npt_pdpt)
+ km_free((void *)pmap->pm_npt_pdpt, PAGE_SIZE, &kv_any, &kp_zero);
+
+ return (ret);
+}
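
pmap_convert() above stitches a 4-level EPT hierarchy out of one new PML4 page, one new PDPT page, and the pmap's four existing PAE page-directory pages (pm_pdidx[0..3]); each PDPT entry spans 1 GB, so the four entries cover the full 4 GB guest-physical space reachable from a 32-bit guest. A sketch of how a guest-physical address decomposes under that layout; the macro names are illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Illustrative index extraction for a 4-level EPT walk (9 bits per level). */
#define EPT_PML4_IDX(gpa)	(((uint64_t)(gpa) >> 39) & 0x1ff)	/* always 0 for a 32-bit gpa */
#define EPT_PDPT_IDX(gpa)	(((uint64_t)(gpa) >> 30) & 0x1ff)	/* selects one of the 4 PAE PD pages */
#define EPT_PD_IDX(gpa)		(((uint64_t)(gpa) >> 21) & 0x1ff)
#define EPT_PT_IDX(gpa)		(((uint64_t)(gpa) >> 12) & 0x1ff)

int
main(void)
{
	uint32_t gpa = 0xc0101000;	/* arbitrary example guest-physical address */

	printf("pml4 %u pdpt %u pd %u pt %u\n",
	    (unsigned)EPT_PML4_IDX(gpa), (unsigned)EPT_PDPT_IDX(gpa),
	    (unsigned)EPT_PD_IDX(gpa), (unsigned)EPT_PT_IDX(gpa));
	return (0);
}
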
diff --git a/sys/arch/i386/i386/vmm.c b/sys/arch/i386/i386/vmm.c
new file mode 100644
index 00000000000..cea820e3bf4
--- /dev/null
+++ b/sys/arch/i386/i386/vmm.c
@@ -0,0 +1,5433 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/device.h>
+#include <sys/pool.h>
+#include <sys/proc.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/pledge.h>
+#include <sys/memrange.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <machine/pmap.h>
+#include <machine/biosvar.h>
+#include <machine/segments.h>
+#include <machine/cpufunc.h>
+#include <machine/vmmvar.h>
+#include <machine/i82489reg.h>
+
+#include <dev/isa/isareg.h>
+
+#define VMM_DEBUG
+
+#ifdef VMM_DEBUG
+int vmm_debug = 1;
+#define DPRINTF(x...) do { if (vmm_debug) printf(x); } while(0)
+#else
+#define DPRINTF(x...)
+#endif /* VMM_DEBUG */
+
+#define DEVNAME(s) ((s)->sc_dev.dv_xname)
+
+#define CTRL_DUMP(x,y,z) printf(" %s: Can set:%s Can clear:%s\n", #z , \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 1) ? "Yes" : "No", \
+ vcpu_vmx_check_cap(x, IA32_VMX_##y ##_CTLS, \
+ IA32_VMX_##z, 0) ? "Yes" : "No");
+
+#define VMX_EXIT_INFO_HAVE_RIP 0x1
+#define VMX_EXIT_INFO_HAVE_REASON 0x2
+#define VMX_EXIT_INFO_COMPLETE \
+ (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
+
+struct vm {
+ vm_map_t vm_map;
+ uint32_t vm_id;
+ pid_t vm_creator_pid;
+ size_t vm_nmemranges;
+ size_t vm_memory_size;
+ char vm_name[VMM_MAX_NAME_LEN];
+ struct vm_mem_range vm_memranges[VMM_MAX_MEM_RANGES];
+
+ struct vcpu_head vm_vcpu_list;
+ uint32_t vm_vcpu_ct;
+ u_int vm_vcpus_running;
+ struct rwlock vm_vcpu_lock;
+
+ SLIST_ENTRY(vm) vm_link;
+};
+
+SLIST_HEAD(vmlist_head, vm);
+
+struct vmm_softc {
+ struct device sc_dev;
+
+ /* Capabilities */
+ uint32_t nr_vmx_cpus;
+ uint32_t nr_svm_cpus;
+ uint32_t nr_rvi_cpus;
+ uint32_t nr_ept_cpus;
+
+ /* Managed VMs */
+ struct vmlist_head vm_list;
+
+ int mode;
+
+ struct rwlock vm_lock;
+ size_t vm_ct; /* number of in-memory VMs */
+ size_t vm_idx; /* next unique VM index */
+};
+
+int vmm_probe(struct device *, void *, void *);
+void vmm_attach(struct device *, struct device *, void *);
+int vmmopen(dev_t, int, int, struct proc *);
+int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
+int vmmclose(dev_t, int, int, struct proc *);
+int vmm_start(void);
+int vmm_stop(void);
+size_t vm_create_check_mem_ranges(struct vm_create_params *);
+int vm_create(struct vm_create_params *, struct proc *);
+int vm_run(struct vm_run_params *);
+int vm_terminate(struct vm_terminate_params *);
+int vm_get_info(struct vm_info_params *);
+int vm_resetcpu(struct vm_resetcpu_params *);
+int vm_intr_pending(struct vm_intr_params *);
+int vm_rwregs(struct vm_rwregs_params *, int);
+int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_writeregs_vmx(struct vcpu *, uint64_t, int, struct vcpu_reg_state *);
+int vcpu_writeregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
+int vcpu_reset_regs(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reset_regs_vmx(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reset_regs_svm(struct vcpu *, struct vcpu_reg_state *);
+int vcpu_reload_vmcs_vmx(uint64_t *);
+int vcpu_init(struct vcpu *);
+int vcpu_init_vmx(struct vcpu *);
+int vcpu_init_svm(struct vcpu *);
+int vcpu_must_stop(struct vcpu *);
+int vcpu_run_vmx(struct vcpu *, struct vm_run_params *);
+int vcpu_run_svm(struct vcpu *, struct vm_run_params *);
+void vcpu_deinit(struct vcpu *);
+void vcpu_deinit_vmx(struct vcpu *);
+void vcpu_deinit_svm(struct vcpu *);
+int vm_impl_init(struct vm *, struct proc *);
+int vm_impl_init_vmx(struct vm *, struct proc *);
+int vm_impl_init_svm(struct vm *, struct proc *);
+void vm_impl_deinit(struct vm *);
+void vm_impl_deinit_vmx(struct vm *);
+void vm_impl_deinit_svm(struct vm *);
+void vm_teardown(struct vm *);
+int vcpu_vmx_check_cap(struct vcpu *, uint32_t, uint32_t, int);
+int vcpu_vmx_compute_ctrl(uint64_t, uint16_t, uint32_t, uint32_t, uint32_t *);
+int vmx_get_exit_info(uint32_t *, uint32_t *);
+int vmx_handle_exit(struct vcpu *);
+int vmx_handle_cpuid(struct vcpu *);
+int vmx_handle_rdmsr(struct vcpu *);
+int vmx_handle_wrmsr(struct vcpu *);
+int vmx_handle_cr(struct vcpu *);
+int vmx_handle_inout(struct vcpu *);
+int vmx_handle_hlt(struct vcpu *);
+void vmx_handle_intr(struct vcpu *);
+void vmx_handle_intwin(struct vcpu *);
+int vmm_get_guest_memtype(struct vm *, paddr_t);
+int vmm_get_guest_faulttype(void);
+int vmx_get_guest_faulttype(void);
+int svm_get_guest_faulttype(void);
+int vmx_get_exit_qualification(uint32_t *);
+int vmx_fault_page(struct vcpu *, paddr_t);
+int vmx_handle_np_fault(struct vcpu *);
+const char *vcpu_state_decode(u_int);
+const char *vmx_exit_reason_decode(uint32_t);
+const char *vmx_instruction_error_decode(uint32_t);
+void vmx_setmsrbr(struct vcpu *, uint32_t);
+void vmx_setmsrbw(struct vcpu *, uint32_t);
+void vmx_setmsrbrw(struct vcpu *, uint32_t);
+
+#ifdef VMM_DEBUG
+void dump_vcpu(struct vcpu *);
+void vmx_vcpu_dump_regs(struct vcpu *);
+void vmx_dump_vmcs(struct vcpu *);
+const char *msr_name_decode(uint32_t);
+void vmm_segment_desc_decode(uint32_t);
+void vmm_decode_cr0(uint32_t);
+void vmm_decode_cr4(uint32_t);
+void vmm_decode_msr_value(uint64_t, uint64_t);
+void vmm_decode_apicbase_msr_value(uint64_t);
+void vmm_decode_ia32_fc_value(uint64_t);
+void vmm_decode_mtrrcap_value(uint64_t);
+void vmm_decode_perf_status_value(uint64_t);
+void vmm_decode_perf_ctl_value(uint64_t);
+void vmm_decode_mtrrdeftype_value(uint64_t);
+void vmm_decode_efer_value(uint64_t);
+
+extern int mtrr2mrt(int);
+
+struct vmm_reg_debug_info {
+ uint64_t vrdi_bit;
+ const char *vrdi_present;
+ const char *vrdi_absent;
+};
+#endif /* VMM_DEBUG */
+
+const char *vmm_hv_signature = VMM_HV_SIGNATURE;
+
+struct cfdriver vmm_cd = {
+ NULL, "vmm", DV_DULL
+};
+
+const struct cfattach vmm_ca = {
+ sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL
+};
+
+/*
+ * Helper struct to easily get the VMCS field IDs needed in vmread/vmwrite
+ * to access the individual fields of the guest segment registers. This
+ * struct is indexed by VCPU_REGS_* id.
+ */
+const struct {
+ uint64_t selid;
+ uint64_t limitid;
+ uint64_t arid;
+ uint64_t baseid;
+} vmm_vmx_sreg_vmcs_fields[] = {
+ { VMCS_GUEST_IA32_CS_SEL, VMCS_GUEST_IA32_CS_LIMIT,
+ VMCS_GUEST_IA32_CS_AR, VMCS_GUEST_IA32_CS_BASE },
+ { VMCS_GUEST_IA32_DS_SEL, VMCS_GUEST_IA32_DS_LIMIT,
+ VMCS_GUEST_IA32_DS_AR, VMCS_GUEST_IA32_DS_BASE },
+ { VMCS_GUEST_IA32_ES_SEL, VMCS_GUEST_IA32_ES_LIMIT,
+ VMCS_GUEST_IA32_ES_AR, VMCS_GUEST_IA32_ES_BASE },
+ { VMCS_GUEST_IA32_FS_SEL, VMCS_GUEST_IA32_FS_LIMIT,
+ VMCS_GUEST_IA32_FS_AR, VMCS_GUEST_IA32_FS_BASE },
+ { VMCS_GUEST_IA32_GS_SEL, VMCS_GUEST_IA32_GS_LIMIT,
+ VMCS_GUEST_IA32_GS_AR, VMCS_GUEST_IA32_GS_BASE },
+ { VMCS_GUEST_IA32_SS_SEL, VMCS_GUEST_IA32_SS_LIMIT,
+ VMCS_GUEST_IA32_SS_AR, VMCS_GUEST_IA32_SS_BASE },
+ { VMCS_GUEST_IA32_LDTR_SEL, VMCS_GUEST_IA32_LDTR_LIMIT,
+ VMCS_GUEST_IA32_LDTR_AR, VMCS_GUEST_IA32_LDTR_BASE },
+ { VMCS_GUEST_IA32_TR_SEL, VMCS_GUEST_IA32_TR_LIMIT,
+ VMCS_GUEST_IA32_TR_AR, VMCS_GUEST_IA32_TR_BASE }
+};
+
+/* Pools for VMs and VCPUs */
+struct pool vm_pool;
+struct pool vcpu_pool;
+
+struct vmm_softc *vmm_softc;
+
+/* IDT information used when populating host state area */
+extern vaddr_t idt_vaddr;
+extern struct gate_descriptor *idt;
+
+/* CPU info (i386) */
+extern char cpu_brandstr[];
+extern uint32_t ecpu_eaxfeature;
+
+/* Constants used in "CR access exit" */
+#define CR_WRITE 0
+#define CR_READ 1
+#define CR_CLTS 2
+#define CR_LMSW 3
+
+/*
+ * vmm_probe
+ *
+ * Checks if we have at least one CPU with either VMX or SVM.
+ * Returns 1 if at least one CPU has either VMX or SVM (but not if both
+ * types are present), 0 otherwise.
+ */
+int
+vmm_probe(struct device *parent, void *match, void *aux)
+{
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ const char **busname = (const char **)aux;
+ int found_vmx, found_svm;
+
+ /* Check if this probe is for us */
+ if (strcmp(*busname, vmm_cd.cd_name) != 0)
+ return (0);
+
+ found_vmx = 0;
+ found_svm = 0;
+
+ /* Check if we have at least one CPU with either VMX or SVM */
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci->ci_vmm_flags & CI_VMM_VMX)
+ found_vmx = 1;
+ if (ci->ci_vmm_flags & CI_VMM_SVM)
+ found_svm = 1;
+ }
+
+ /* Don't support both SVM and VMX at the same time */
+ if (found_vmx && found_svm)
+ return (0);
+
+ return (found_vmx || found_svm);
+}
+
+/*
+ * vmm_attach
+ *
+ * Calculates how many of each type of CPU we have, prints this into dmesg
+ * during attach. Initializes various locks, pools, and list structures for the
+ * VMM.
+ */
+void
+vmm_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct vmm_softc *sc = (struct vmm_softc *)self;
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+
+ sc->nr_vmx_cpus = 0;
+ sc->nr_svm_cpus = 0;
+ sc->nr_rvi_cpus = 0;
+ sc->nr_ept_cpus = 0;
+ sc->vm_ct = 0;
+ sc->vm_idx = 0;
+
+ /* Calculate CPU features */
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci->ci_vmm_flags & CI_VMM_VMX)
+ sc->nr_vmx_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_SVM)
+ sc->nr_svm_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_RVI)
+ sc->nr_rvi_cpus++;
+ if (ci->ci_vmm_flags & CI_VMM_EPT)
+ sc->nr_ept_cpus++;
+ }
+
+ SLIST_INIT(&sc->vm_list);
+ rw_init(&sc->vm_lock, "vmlistlock");
+
+ if (sc->nr_ept_cpus) {
+ printf(": VMX/EPT\n");
+ sc->mode = VMM_MODE_EPT;
+ } else if (sc->nr_vmx_cpus) {
+ printf(": VMX\n");
+ sc->mode = VMM_MODE_VMX;
+ } else if (sc->nr_rvi_cpus) {
+ printf(": SVM/RVI\n");
+ sc->mode = VMM_MODE_RVI;
+ } else if (sc->nr_svm_cpus) {
+ printf(": SVM\n");
+ sc->mode = VMM_MODE_SVM;
+ } else {
+ printf(": unknown\n");
+ sc->mode = VMM_MODE_UNKNOWN;
+ }
+
+ pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
+ "vmpool", NULL);
+ pool_init(&vcpu_pool, sizeof(struct vcpu), 0, IPL_NONE, PR_WAITOK,
+ "vcpupl", NULL);
+
+ vmm_softc = sc;
+}
+
+/*
+ * vmmopen
+ *
+ * Called during open of /dev/vmm. Presently unused.
+ */
+int
+vmmopen(dev_t dev, int flag, int mode, struct proc *p)
+{
+ /* Don't allow open if we didn't attach */
+ if (vmm_softc == NULL)
+ return (ENODEV);
+
+ /* Don't allow open if we didn't detect any supported CPUs */
+ /* XXX presently this means EPT until SP and SVM are back */
+ if (vmm_softc->mode != VMM_MODE_EPT)
+ return (ENODEV);
+
+ return 0;
+}
+
+/*
+ * vmmioctl
+ *
+ * Main ioctl dispatch routine for /dev/vmm. Parses ioctl type and calls
+ * appropriate lower level handler routine. Returns result to ioctl caller.
+ */
+int
+vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
+{
+ int ret;
+
+ switch (cmd) {
+ case VMM_IOC_CREATE:
+ if ((ret = vmm_start()) != 0) {
+ vmm_stop();
+ break;
+ }
+ ret = vm_create((struct vm_create_params *)data, p);
+ break;
+ case VMM_IOC_RUN:
+ ret = vm_run((struct vm_run_params *)data);
+ break;
+ case VMM_IOC_INFO:
+ ret = vm_get_info((struct vm_info_params *)data);
+ break;
+ case VMM_IOC_TERM:
+ ret = vm_terminate((struct vm_terminate_params *)data);
+ break;
+ case VMM_IOC_RESETCPU:
+ ret = vm_resetcpu((struct vm_resetcpu_params *)data);
+ break;
+ case VMM_IOC_INTR:
+ ret = vm_intr_pending((struct vm_intr_params *)data);
+ break;
+ case VMM_IOC_READREGS:
+ ret = vm_rwregs((struct vm_rwregs_params *)data, 0);
+ break;
+ case VMM_IOC_WRITEREGS:
+ ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
+ break;
+ default:
+ DPRINTF("vmmioctl: unknown ioctl code 0x%lx\n", cmd);
+ ret = ENOTTY;
+ }
+
+ return (ret);
+}
+
+/*
+ * pledge_ioctl_vmm
+ *
+ * Restrict the allowed ioctls in a pledged process context.
+ * Is called from pledge_ioctl().
+ */
+int
+pledge_ioctl_vmm(struct proc *p, long com)
+{
+ switch (com) {
+ case VMM_IOC_CREATE:
+ case VMM_IOC_INFO:
+ /* The "parent" process in vmd forks and manages VMs */
+ if (p->p_p->ps_pledge & PLEDGE_PROC)
+ return (0);
+ break;
+ case VMM_IOC_TERM:
+ /* XXX VM processes should only terminate themselves */
+ case VMM_IOC_RUN:
+ case VMM_IOC_RESETCPU:
+ return (0);
+ }
+
+ return (EPERM);
+}
+
+/*
+ * vmmclose
+ *
+ * Called when /dev/vmm is closed. Presently unused.
+ */
+int
+vmmclose(dev_t dev, int flag, int mode, struct proc *p)
+{
+ return 0;
+}
+
+/*
+ * vm_resetcpu
+ *
+ * Resets the vcpu defined in 'vrp' to power-on-init register state
+ *
+ * Parameters:
+ * vrp: ioctl structure defining the vcpu to reset (see vmmvar.h)
+ *
+ * Returns 0 if successful, or various error codes on failure:
+ * ENOENT if the VM id contained in 'vrp' refers to an unknown VM or
+ * if vrp describes an unknown vcpu for this VM
+ * EBUSY if the indicated VCPU is not stopped
+ * EIO if the indicated VCPU failed to reset
+ */
+int
+vm_resetcpu(struct vm_resetcpu_params *vrp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrp->vrp_vm_id)
+ break;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ DPRINTF("vm_resetcpu: vm id %u not found\n",
+ vrp->vrp_vm_id);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrp->vrp_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+
+ if (vcpu == NULL) {
+ DPRINTF("vm_resetcpu: vcpu id %u of vm %u not found\n",
+ vrp->vrp_vcpu_id, vrp->vrp_vm_id);
+ return (ENOENT);
+ }
+
+ if (vcpu->vc_state != VCPU_STATE_STOPPED) {
+ DPRINTF("vm_resetcpu: reset of vcpu %u on vm %u attempted "
+ "while vcpu was in state %u (%s)\n", vrp->vrp_vcpu_id,
+ vrp->vrp_vm_id, vcpu->vc_state,
+ vcpu_state_decode(vcpu->vc_state));
+
+ return (EBUSY);
+ }
+
+ DPRINTF("vm_resetcpu: resetting vm %d vcpu %d to power on defaults\n",
+ vm->vm_id, vcpu->vc_id);
+
+ if (vcpu_reset_regs(vcpu, &vrp->vrp_init_state)) {
+ printf("vm_resetcpu: failed\n");
+#ifdef VMM_DEBUG
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * vm_intr_pending
+ *
+ * IOCTL handler routine for VMM_IOC_INTR messages, sent from vmd when an
+ * interrupt is pending and needs acknowledgment
+ *
+ * Parameters:
+ * vip: Describes the vm/vcpu for which the interrupt is pending
+ *
+ * Return values:
+ * 0: if successful
+ * ENOENT: if the VM/VCPU defined by 'vip' cannot be found
+ */
+int
+vm_intr_pending(struct vm_intr_params *vip)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vip->vip_vm_id)
+ break;
+ }
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vip->vip_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vcpu == NULL)
+ return (ENOENT);
+
+ vcpu->vc_intr = vip->vip_intr;
+
+#ifdef MULTIPROCESSOR
+ /*
+ * If the vcpu is running on another PCPU, attempt to force it
+ * to exit to process the pending interrupt. This could race as
+ * it could be running when we do the check but be stopped by the
+ * time we send the IPI. In this case, there is a small extra
+ * overhead to process the IPI but no other side effects.
+ *
+ * There is also a chance that the vcpu may have interrupts blocked.
+ * That's ok as that condition will be checked on exit, and we will
+ * simply re-enter the guest. This "fast notification" is done only
+ * as an optimization.
+ */
+ if (vcpu->vc_state == VCPU_STATE_RUNNING &&
+ vip->vip_intr == 1)
+ x86_send_ipi(vcpu->vc_last_pcpu, X86_IPI_NOP);
+#endif /* MULTIPROCESSOR */
+
+ return (0);
+}
+
+/*
+ * vm_rwregs
+ *
+ * IOCTL handler to read/write the current register values of a guest VCPU.
+ * The VCPU must not be running.
+ *
+ * Parameters:
+ * vrwp: Describes the VM and VCPU to get/set the registers from. The
+ * register values are returned here as well.
+ * dir: 0 for reading, 1 for writing
+ *
+ * Return values:
+ * 0: if successful
+ * ENOENT: if the VM/VCPU defined by 'vrwp' cannot be found
+ * EINVAL: if an error occurred reading the registers of the guest
+ */
+int
+vm_rwregs(struct vm_rwregs_params *vrwp, int dir)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ struct vcpu_reg_state *vrs = &vrwp->vrwp_regs;
+
+ /* Find the desired VM */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrwp->vrwp_vm_id)
+ break;
+ }
+
+ /* Not found? exit. */
+ if (vm == NULL) {
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOENT);
+ }
+
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrwp->vrwp_vcpu_id)
+ break;
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vcpu == NULL)
+ return (ENOENT);
+
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ return (dir == 0) ?
+ vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) :
+ vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ return (dir == 0) ?
+ vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) :
+ vcpu_writeregs_svm(vcpu, vrwp->vrwp_mask, vrs);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vmm_start
+ *
+ * Starts VMM mode on the system
+ */
+int
+vmm_start(void)
+{
+ struct cpu_info *self = curcpu();
+ int ret = 0;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ int i;
+#endif
+
+ /* VMM is already running */
+ if (self->ci_flags & CPUF_VMM)
+ return (0);
+
+#ifdef MULTIPROCESSOR
+ /* Broadcast start VMM IPI */
+ x86_broadcast_ipi(X86_IPI_START_VMM);
+
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci == self)
+ continue;
+ for (i = 100000; (!(ci->ci_flags & CPUF_VMM)) && i>0;i--)
+ delay(10);
+ if (!(ci->ci_flags & CPUF_VMM)) {
+ printf("%s: failed to enter VMM mode\n",
+ ci->ci_dev->dv_xname);
+ ret = EIO;
+ }
+ }
+#endif /* MULTIPROCESSOR */
+
+ /* Start VMM on this CPU */
+ start_vmm_on_cpu(self);
+ if (!(self->ci_flags & CPUF_VMM)) {
+ printf("%s: failed to enter VMM mode\n",
+ self->ci_dev.dv_xname);
+ ret = EIO;
+ }
+
+ return (ret);
+}
+
+/*
+ * vmm_stop
+ *
+ * Stops VMM mode on the system
+ */
+int
+vmm_stop(void)
+{
+ struct cpu_info *self = curcpu();
+ int ret = 0;
+#ifdef MULTIPROCESSOR
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ int i;
+#endif
+
+ /* VMM is not running */
+ if (!(self->ci_flags & CPUF_VMM))
+ return (0);
+
+#ifdef MULTIPROCESSOR
+ /* Stop VMM on other CPUs */
+ x86_broadcast_ipi(X86_IPI_STOP_VMM);
+
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci == self)
+ continue;
+ for (i = 100000; (ci->ci_flags & CPUF_VMM) && i>0 ;i--)
+ delay(10);
+ if (ci->ci_flags & CPUF_VMM) {
+ printf("%s: failed to exit VMM mode\n",
+ ci->ci_dev->dv_xname);
+ ret = EIO;
+ }
+ }
+#endif /* MULTIPROCESSOR */
+
+ /* Stop VMM on this CPU */
+ stop_vmm_on_cpu(self);
+ if (self->ci_flags & CPUF_VMM) {
+ printf("%s: failed to exit VMM mode\n",
+ self->ci_dev.dv_xname);
+ ret = EIO;
+ }
+
+ return (ret);
+}
+
+/*
+ * start_vmm_on_cpu
+ *
+ * Starts VMM mode on 'ci' by executing the appropriate CPU-specific insn
+ * sequence to enter VMM mode (eg, VMXON)
+ */
+void
+start_vmm_on_cpu(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cr4;
+
+ /* No VMM mode? exit. */
+ if ((ci->ci_vmm_flags & CI_VMM_VMX) == 0 &&
+ (ci->ci_vmm_flags & CI_VMM_SVM) == 0)
+ return;
+
+ /*
+ * AMD SVM
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ msr = rdmsr(MSR_EFER);
+ msr |= EFER_SVME;
+ wrmsr(MSR_EFER, msr);
+ }
+
+ /*
+ * Intel VMX
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ if (ci->ci_vmxon_region == 0)
+ return;
+ else {
+ bzero(ci->ci_vmxon_region, PAGE_SIZE);
+ ci->ci_vmxon_region->vr_revision =
+ ci->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+ /* Set CR4.VMXE */
+ cr4 = rcr4();
+ cr4 |= CR4_VMXE;
+ lcr4(cr4);
+
+ /* Enable VMX */
+ msr = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if (msr & IA32_FEATURE_CONTROL_LOCK) {
+ if (!(msr & IA32_FEATURE_CONTROL_VMX_EN))
+ return;
+ } else {
+ msr |= IA32_FEATURE_CONTROL_VMX_EN |
+ IA32_FEATURE_CONTROL_LOCK;
+ wrmsr(MSR_IA32_FEATURE_CONTROL, msr);
+ }
+
+ /* Enter VMX mode */
+ if (vmxon(&ci->ci_vmxon_region_pa))
+ return;
+ }
+ }
+
+ ci->ci_flags |= CPUF_VMM;
+}
+
+/*
+ * stop_vmm_on_cpu
+ *
+ * Stops VMM mode on 'ci' by executing the appropriate CPU-specific insn
+ * sequence to exit VMM mode (eg, VMXOFF)
+ */
+void
+stop_vmm_on_cpu(struct cpu_info *ci)
+{
+ uint64_t msr;
+ uint32_t cr4;
+
+ if (!(ci->ci_flags & CPUF_VMM))
+ return;
+
+ /*
+ * AMD SVM
+ */
+ if (ci->ci_vmm_flags & CI_VMM_SVM) {
+ msr = rdmsr(MSR_EFER);
+ msr &= ~EFER_SVME;
+ wrmsr(MSR_EFER, msr);
+ }
+
+ /*
+ * Intel VMX
+ */
+ if (ci->ci_vmm_flags & CI_VMM_VMX) {
+ if (vmxoff())
+ panic("VMXOFF failed\n");
+
+ cr4 = rcr4();
+ cr4 &= ~CR4_VMXE;
+ lcr4(cr4);
+ }
+
+ ci->ci_flags &= ~CPUF_VMM;
+}
+
+/*
+ * vm_create_check_mem_ranges:
+ *
+ * Make sure that the guest physical memory ranges given by the user process
+ * do not overlap and are in ascending order.
+ *
+ * The last physical address may not exceed VMM_MAX_VM_MEM_SIZE.
+ *
+ * Return Values:
+ * The total memory size in MB if the checks were successful
+ * 0: One of the memory ranges was invalid, or VMM_MAX_VM_MEM_SIZE was
+ * exceeded
+ */
+size_t
+vm_create_check_mem_ranges(struct vm_create_params *vcp)
+{
+ int disjunct_range;
+ size_t i, memsize = 0;
+ struct vm_mem_range *vmr, *pvmr;
+ const paddr_t maxgpa = (uint32_t)VMM_MAX_VM_MEM_SIZE * 1024 * 1024;
+
+ if (vcp->vcp_nmemranges == 0 ||
+ vcp->vcp_nmemranges > VMM_MAX_MEM_RANGES)
+ return (0);
+
+ for (i = 0; i < vcp->vcp_nmemranges; i++) {
+ vmr = &vcp->vcp_memranges[i];
+
+ /* Only page-aligned addresses and sizes are permitted */
+ if ((vmr->vmr_gpa & PAGE_MASK) || (vmr->vmr_va & PAGE_MASK) ||
+ (vmr->vmr_size & PAGE_MASK) || vmr->vmr_size == 0)
+ return (0);
+
+ /* Make sure that VMM_MAX_VM_MEM_SIZE is not exceeded */
+ if (vmr->vmr_gpa >= maxgpa ||
+ vmr->vmr_size > maxgpa - vmr->vmr_gpa)
+ return (0);
+
+ /*
+ * Make sure that all virtual addresses are within the address
+ * space of the process and that they do not wrap around.
+ * Calling uvm_share() when creating the VM will take care of
+ * further checks.
+ */
+ if (vmr->vmr_va < VM_MIN_ADDRESS ||
+ vmr->vmr_va >= VM_MAXUSER_ADDRESS ||
+ vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va)
+ return (0);
+
+ /* Specifying ranges within the PCI MMIO space is forbidden */
+ disjunct_range = (vmr->vmr_gpa > VMM_PCI_MMIO_BAR_END) ||
+ (vmr->vmr_gpa + vmr->vmr_size <= VMM_PCI_MMIO_BAR_BASE);
+ if (!disjunct_range)
+ return (0);
+
+ /*
+ * Make sure that guest physical memory ranges do not overlap
+ * and that they are ascending.
+ */
+ if (i > 0 && pvmr->vmr_gpa + pvmr->vmr_size > vmr->vmr_gpa)
+ return (0);
+
+ memsize += vmr->vmr_size;
+ pvmr = vmr;
+ }
+
+ if (memsize % (1024 * 1024) != 0)
+ return (0);
+ memsize /= 1024 * 1024;
+ return (memsize);
+}
+
+/*
+ * vm_create
+ *
+ * Creates the in-memory VMM structures for the VM defined by 'vcp'. The
+ * parent of this VM shall be the process defined by 'p'.
+ * This function does not start the VCPU(s) - see vm_run.
+ *
+ * Return Values:
+ * 0: the create operation was successful
+ * ENOMEM: out of memory
+ * various other errors from vcpu_init/vm_impl_init
+ */
+int
+vm_create(struct vm_create_params *vcp, struct proc *p)
+{
+ int i, ret;
+ size_t memsize;
+ struct vm *vm;
+ struct vcpu *vcpu;
+
+ if (!(curcpu()->ci_flags & CPUF_VMM))
+ return (EINVAL);
+
+ memsize = vm_create_check_mem_ranges(vcp);
+ if (memsize == 0)
+ return (EINVAL);
+
+ /* XXX - support UP only (for now) */
+ if (vcp->vcp_ncpus != 1)
+ return (EINVAL);
+
+ vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
+ SLIST_INIT(&vm->vm_vcpu_list);
+ rw_init(&vm->vm_vcpu_lock, "vcpulock");
+
+ vm->vm_creator_pid = p->p_p->ps_pid;
+ vm->vm_nmemranges = vcp->vcp_nmemranges;
+ memcpy(vm->vm_memranges, vcp->vcp_memranges,
+ vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
+ vm->vm_memory_size = memsize;
+ strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN);
+
+ if (vm_impl_init(vm, p)) {
+ printf("failed to init arch-specific features for vm 0x%p\n",
+ vm);
+ vm_teardown(vm);
+ return (ENOMEM);
+ }
+
+ rw_enter_write(&vmm_softc->vm_lock);
+ vmm_softc->vm_ct++;
+ vmm_softc->vm_idx++;
+
+ /*
+ * XXX we use the vm_id for the VPID/ASID, so we need to prevent
+ * wrapping around 65536/4096 entries here
+ */
+ vm->vm_id = vmm_softc->vm_idx;
+ vm->vm_vcpu_ct = 0;
+ vm->vm_vcpus_running = 0;
+
+ /* Initialize each VCPU defined in 'vcp' */
+ for (i = 0; i < vcp->vcp_ncpus; i++) {
+ vcpu = pool_get(&vcpu_pool, PR_WAITOK | PR_ZERO);
+ vcpu->vc_parent = vm;
+ if ((ret = vcpu_init(vcpu)) != 0) {
+ printf("failed to init vcpu %d for vm 0x%p\n", i, vm);
+ vm_teardown(vm);
+ vmm_softc->vm_ct--;
+ vmm_softc->vm_idx--;
+ rw_exit_write(&vmm_softc->vm_lock);
+ return (ret);
+ }
+ rw_enter_write(&vm->vm_vcpu_lock);
+ vcpu->vc_id = vm->vm_vcpu_ct;
+ vm->vm_vcpu_ct++;
+ SLIST_INSERT_HEAD(&vm->vm_vcpu_list, vcpu, vc_vcpu_link);
+ rw_exit_write(&vm->vm_vcpu_lock);
+ }
+
+ /* XXX init various other hardware parts (vlapic, vioapic, etc) */
+
+ SLIST_INSERT_HEAD(&vmm_softc->vm_list, vm, vm_link);
+ rw_exit_write(&vmm_softc->vm_lock);
+
+ vcp->vcp_id = vm->vm_id;
+
+ return (0);
+}
+
+/*
+ * vm_impl_init_vmx
+ *
+ * Intel VMX specific VM initialization routine
+ */
+int
+vm_impl_init_vmx(struct vm *vm, struct proc *p)
+{
+ int i, ret;
+ vaddr_t mingpa, maxgpa;
+ struct pmap *pmap;
+ struct vm_mem_range *vmr;
+
+ /* If not EPT, nothing to do here */
+ if (vmm_softc->mode != VMM_MODE_EPT)
+ return (0);
+
+ /* Create a new pmap for this VM */
+ pmap = pmap_create();
+ if (!pmap) {
+ printf("vm_impl_init_vmx: pmap_create failed\n");
+ return (ENOMEM);
+ }
+
+ /*
+ * Create a new UVM map for this VM, and assign it the pmap just
+ * created.
+ */
+ vmr = &vm->vm_memranges[0];
+ mingpa = vmr->vmr_gpa;
+ vmr = &vm->vm_memranges[vm->vm_nmemranges - 1];
+ maxgpa = vmr->vmr_gpa + vmr->vmr_size;
+ vm->vm_map = uvm_map_create(pmap, mingpa, maxgpa,
+ VM_MAP_ISVMSPACE | VM_MAP_PAGEABLE);
+
+ if (!vm->vm_map) {
+ printf("vm_impl_init_vmx: uvm_map_create failed\n");
+ pmap_destroy(pmap);
+ return (ENOMEM);
+ }
+
+ /* Map the new map with an anon */
+ DPRINTF("vm_impl_init_vmx: created vm_map @ %p\n", vm->vm_map);
+ for (i = 0; i < vm->vm_nmemranges; i++) {
+ vmr = &vm->vm_memranges[i];
+ ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);
+ if (ret) {
+ printf("vm_impl_init_vmx: uvm_share failed (%d)\n",
+ ret);
+ /* uvm_map_deallocate calls pmap_destroy for us */
+ uvm_map_deallocate(vm->vm_map);
+ vm->vm_map = NULL;
+ return (ENOMEM);
+ }
+ }
+
+ /* Convert the low 512GB of the pmap to EPT */
+ ret = pmap_convert(pmap, PMAP_TYPE_EPT);
+ if (ret) {
+ printf("vm_impl_init_vmx: pmap_convert failed\n");
+ /* uvm_map_deallocate calls pmap_destroy for us */
+ uvm_map_deallocate(vm->vm_map);
+ vm->vm_map = NULL;
+ return (ENOMEM);
+ }
+
+ return (0);
+}
+
+/*
+ * vm_impl_init_svm
+ *
+ * AMD SVM specific VM initialization routine
+ */
+int
+vm_impl_init_svm(struct vm *vm, struct proc *p)
+{
+ /* XXX removed due to rot */
+ return (-1);
+}
+
+/*
+ * vm_impl_init
+ *
+ * Calls the architecture-specific VM init routine
+ */
+int
+vm_impl_init(struct vm *vm, struct proc *p)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ return vm_impl_init_vmx(vm, p);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ return vm_impl_init_svm(vm, p);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vm_impl_deinit_vmx
+ *
+ * Intel VMX specific VM deinitialization routine
+ */
+void
+vm_impl_deinit_vmx(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit_svm
+ *
+ * AMD SVM specific VM deinitialization routine
+ */
+void
+vm_impl_deinit_svm(struct vm *vm)
+{
+ /* Unused */
+}
+
+/*
+ * vm_impl_deinit
+ *
+ * Calls the architecture-specific VM deinit routine
+ */
+void
+vm_impl_deinit(struct vm *vm)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ vm_impl_deinit_vmx(vm);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ vm_impl_deinit_svm(vm);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vcpu_reload_vmcs_vmx
+ *
+ * Loads 'vmcs' on the current CPU, possibly flushing any old vmcs state
+ * of the previous occupant.
+ *
+ * Parameters:
+ * vmcs: Pointer to uint64_t containing the PA of the vmcs to load
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error occurred during flush or reload
+ */
+int
+vcpu_reload_vmcs_vmx(uint64_t *vmcs)
+{
+ uint64_t old;
+
+ /* Flush any old state */
+ if (!vmptrst(&old)) {
+ if (old != 0xFFFFFFFFFFFFFFFFULL) {
+ if (vmclear(&old))
+ return (EINVAL);
+ }
+ } else
+ return (EINVAL);
+
+ /*
+ * Load the VMCS onto this PCPU
+ */
+ if (vmptrld(vmcs))
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * vcpu_readregs_vmx
+ *
+ * Reads 'vcpu's registers
+ *
+ * Parameters:
+ * vcpu: the vcpu to read register values from
+ * regmask: the types of registers to read
+ * vrs: output parameter where register values are stored
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error reading registers occurred
+ */
+int
+vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *vrs)
+{
+ int i, ret = 0;
+ uint32_t ar, sel;
+ uint32_t limit;
+ uint32_t *gprs = vrs->vrs_gprs;
+ uint32_t *crs = vrs->vrs_crs;
+ struct vcpu_segment_info *sregs = vrs->vrs_sregs;
+
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+
+ if (regmask & VM_RWREGS_GPRS) {
+ gprs[VCPU_REGS_EAX] = vcpu->vc_gueststate.vg_eax;
+ gprs[VCPU_REGS_EBX] = vcpu->vc_gueststate.vg_ebx;
+ gprs[VCPU_REGS_ECX] = vcpu->vc_gueststate.vg_ecx;
+ gprs[VCPU_REGS_EDX] = vcpu->vc_gueststate.vg_edx;
+ gprs[VCPU_REGS_ESI] = vcpu->vc_gueststate.vg_esi;
+ gprs[VCPU_REGS_EDI] = vcpu->vc_gueststate.vg_edi;
+ gprs[VCPU_REGS_EBP] = vcpu->vc_gueststate.vg_ebp;
+ gprs[VCPU_REGS_EIP] = vcpu->vc_gueststate.vg_eip;
+ if (vmread(VMCS_GUEST_IA32_RSP, &gprs[VCPU_REGS_ESP]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_RFLAGS, &gprs[VCPU_REGS_EFLAGS]))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_SREGS) {
+ for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) {
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].selid, &sel))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].limitid, &limit))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].arid, &ar))
+ goto errout;
+ if (vmread(vmm_vmx_sreg_vmcs_fields[i].baseid,
+ &sregs[i].vsi_base))
+ goto errout;
+
+ sregs[i].vsi_sel = sel;
+ sregs[i].vsi_limit = limit;
+ sregs[i].vsi_ar = ar;
+ }
+
+ if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &limit))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_GDTR_BASE,
+ &vrs->vrs_gdtr.vsi_base))
+ goto errout;
+ vrs->vrs_gdtr.vsi_limit = limit;
+
+ if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &limit))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_IDTR_BASE,
+ &vrs->vrs_idtr.vsi_base))
+ goto errout;
+ vrs->vrs_idtr.vsi_limit = limit;
+ }
+ if (regmask & VM_RWREGS_CRS) {
+ crs[VCPU_REGS_CR2] = vcpu->vc_gueststate.vg_cr2;
+ if (vmread(VMCS_GUEST_IA32_CR0, &crs[VCPU_REGS_CR0]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_CR3, &crs[VCPU_REGS_CR3]))
+ goto errout;
+ if (vmread(VMCS_GUEST_IA32_CR4, &crs[VCPU_REGS_CR4]))
+ goto errout;
+ }
+
+ goto out;
+
+errout:
+ ret = EINVAL;
+out:
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ return (ret);
+}
+
+/*
+ * vcpu_readregs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_readregs_svm(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *regs)
+{
+ return (0);
+}
+
+/*
+ * vcpu_writeregs_vmx
+ *
+ * Writes 'vcpu's registers
+ *
+ * Parameters:
+ * vcpu: the vcpu that has to get its registers written to
+ * regmask: the types of registers to write
+ * loadvmcs: bit to indicate whether the VMCS has to be loaded first
+ * vrs: the register values to write
+ *
+ * Return values:
+ * 0: if successful
+ * EINVAL: an error writing registers occurred
+ */
+int
+vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask, int loadvmcs,
+ struct vcpu_reg_state *vrs)
+{
+ int i, ret = 0;
+ uint16_t sel;
+ uint32_t limit, ar;
+ uint32_t *gprs = vrs->vrs_gprs;
+ uint32_t *crs = vrs->vrs_crs;
+ struct vcpu_segment_info *sregs = vrs->vrs_sregs;
+
+ if (loadvmcs) {
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+ }
+
+ if (regmask & VM_RWREGS_GPRS) {
+ vcpu->vc_gueststate.vg_eax = gprs[VCPU_REGS_EAX];
+ vcpu->vc_gueststate.vg_ebx = gprs[VCPU_REGS_EBX];
+ vcpu->vc_gueststate.vg_ecx = gprs[VCPU_REGS_ECX];
+ vcpu->vc_gueststate.vg_edx = gprs[VCPU_REGS_EDX];
+ vcpu->vc_gueststate.vg_esi = gprs[VCPU_REGS_ESI];
+ vcpu->vc_gueststate.vg_edi = gprs[VCPU_REGS_EDI];
+ vcpu->vc_gueststate.vg_ebp = gprs[VCPU_REGS_EBP];
+ vcpu->vc_gueststate.vg_eip = gprs[VCPU_REGS_EIP];
+ if (vmwrite(VMCS_GUEST_IA32_RIP, gprs[VCPU_REGS_EIP]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_RSP, gprs[VCPU_REGS_ESP]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_RFLAGS, gprs[VCPU_REGS_EFLAGS]))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_SREGS) {
+ for (i = 0; i < nitems(vmm_vmx_sreg_vmcs_fields); i++) {
+ sel = sregs[i].vsi_sel;
+ limit = sregs[i].vsi_limit;
+ ar = sregs[i].vsi_ar;
+
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].selid, sel))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].limitid, limit))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].arid, ar))
+ goto errout;
+ if (vmwrite(vmm_vmx_sreg_vmcs_fields[i].baseid,
+ sregs[i].vsi_base))
+ goto errout;
+ }
+
+ if (vmwrite(VMCS_GUEST_IA32_GDTR_LIMIT,
+ vrs->vrs_gdtr.vsi_limit))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_GDTR_BASE,
+ vrs->vrs_gdtr.vsi_base))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_IDTR_LIMIT,
+ vrs->vrs_idtr.vsi_limit))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_IDTR_BASE,
+ vrs->vrs_idtr.vsi_base))
+ goto errout;
+ }
+ if (regmask & VM_RWREGS_CRS) {
+ if (vmwrite(VMCS_GUEST_IA32_CR0, crs[VCPU_REGS_CR0]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_CR3, crs[VCPU_REGS_CR3]))
+ goto errout;
+ if (vmwrite(VMCS_GUEST_IA32_CR4, crs[VCPU_REGS_CR4]))
+ goto errout;
+ }
+
+ goto out;
+
+errout:
+ ret = EINVAL;
+out:
+ if (loadvmcs) {
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ }
+ return (ret);
+}
+
+/*
+ * vcpu_writeregs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_writeregs_svm(struct vcpu *vcpu, uint64_t regmask,
+ struct vcpu_reg_state *vrs)
+{
+ return (0);
+}
+
+/*
+ * vcpu_reset_regs_svm
+ *
+ * XXX - unimplemented
+ */
+int
+vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ return (0);
+}
+
+/*
+ * vmx_setmsrbr
+ *
+ * Allow read access to the specified msr on the supplied vcpu.
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbr(struct vcpu *vcpu, uint32_t msr)
+{
+ uint8_t *msrs;
+ uint16_t idx;
+
+ msrs = (uint8_t *)vcpu->vc_msr_bitmap_va;
+
+ /*
+ * MSR Read bitmap layout:
+ * "Low" MSRs (0x0 - 0x1fff) @ 0x0
+ * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0x400
+ */
+ if (msr <= 0x1fff) {
+ idx = MSRIDX(msr);
+ msrs[idx] &= ~(MSRBIT(msr));
+ } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
+ idx = MSRIDX(msr - 0xc0000000) + 0x400;
+ msrs[idx] &= ~(MSRBIT(msr - 0xc0000000));
+ } else
+ printf("%s: invalid msr 0x%x\n", __func__, msr);
+}
+
+/*
+ * vmx_setmsrbw
+ *
+ * Allow write access to the specified msr on the supplied vcpu
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbw(struct vcpu *vcpu, uint32_t msr)
+{
+ uint8_t *msrs;
+ uint16_t idx;
+
+ msrs = (uint8_t *)vcpu->vc_msr_bitmap_va;
+
+ /*
+ * MSR Write bitmap layout:
+ * "Low" MSRs (0x0 - 0x1fff) @ 0x800
+ * "High" MSRs (0xc0000000 - 0xc0001fff) @ 0xc00
+ */
+ if (msr <= 0x1fff) {
+ idx = MSRIDX(msr) + 0x800;
+ msrs[idx] &= ~(MSRBIT(msr));
+ } else if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
+ idx = MSRIDX(msr - 0xc0000000) + 0xc00;
+ msrs[idx] &= ~(MSRBIT(msr - 0xc0000000));
+ } else
+ printf("%s: invalid msr 0x%x\n", __func__, msr);
+}
+
+/*
+ * vmx_setmsrbrw
+ *
+ * Allow read/write access to the specified msr on the supplied vcpu
+ *
+ * Parameters:
+ * vcpu: the VCPU to allow access
+ * msr: the MSR number to allow access to
+ */
+void
+vmx_setmsrbrw(struct vcpu *vcpu, uint32_t msr)
+{
+ vmx_setmsrbr(vcpu, msr);
+ vmx_setmsrbw(vcpu, msr);
+}
+
+/*
+ * vcpu_reset_regs_vmx
+ *
+ * Initializes 'vcpu's registers to supplied state
+ *
+ * Parameters:
+ * vcpu: the vcpu whose register state is to be initialized
+ * vrs: the register state to set
+ *
+ * Return values:
+ * 0: registers init'ed successfully
+ * EINVAL: an error occurred setting register state
+ */
+int
+vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ int ret, ug;
+ uint32_t cr0, cr4;
+ uint32_t pinbased, procbased, procbased2, exit, entry;
+ uint32_t want1, want0;
+ uint64_t msr, ctrlval, eptp, cr3;
+ uint16_t ctrl;
+ struct vmx_msr_store *msr_store;
+
+ ret = 0;
+ ug = 0;
+
+ if (vcpu_reload_vmcs_vmx(&vcpu->vc_control_pa))
+ return (EINVAL);
+
+ /* Compute Basic Entry / Exit Controls */
+ vcpu->vc_vmx_basic = rdmsr(IA32_VMX_BASIC);
+ vcpu->vc_vmx_entry_ctls = rdmsr(IA32_VMX_ENTRY_CTLS);
+ vcpu->vc_vmx_exit_ctls = rdmsr(IA32_VMX_EXIT_CTLS);
+ vcpu->vc_vmx_pinbased_ctls = rdmsr(IA32_VMX_PINBASED_CTLS);
+ vcpu->vc_vmx_procbased_ctls = rdmsr(IA32_VMX_PROCBASED_CTLS);
+
+ /* Compute True Entry / Exit Controls (if applicable) */
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ vcpu->vc_vmx_true_entry_ctls = rdmsr(IA32_VMX_TRUE_ENTRY_CTLS);
+ vcpu->vc_vmx_true_exit_ctls = rdmsr(IA32_VMX_TRUE_EXIT_CTLS);
+ vcpu->vc_vmx_true_pinbased_ctls =
+ rdmsr(IA32_VMX_TRUE_PINBASED_CTLS);
+ vcpu->vc_vmx_true_procbased_ctls =
+ rdmsr(IA32_VMX_TRUE_PROCBASED_CTLS);
+ }
+
+ /* Compute Secondary Procbased Controls (if applicable) */
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1))
+ vcpu->vc_vmx_procbased2_ctls = rdmsr(IA32_VMX_PROCBASED2_CTLS);
+
+ /*
+ * Pinbased ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_EXTERNAL_INT_EXITING - exit on host interrupt
+ * IA32_VMX_NMI_EXITING - exit on host NMI
+ */
+ want1 = IA32_VMX_EXTERNAL_INT_EXITING |
+ IA32_VMX_NMI_EXITING;
+ want0 = 0;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_PINBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_true_pinbased_ctls;
+ } else {
+ ctrl = IA32_VMX_PINBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_pinbased_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &pinbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PINBASED_CTLS, pinbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Procbased ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_HLT_EXITING - exit on HLT instruction
+ * IA32_VMX_MWAIT_EXITING - exit on MWAIT instruction
+ * IA32_VMX_UNCONDITIONAL_IO_EXITING - exit on I/O instructions
+ * IA32_VMX_USE_MSR_BITMAPS - exit on various MSR accesses
+ * IA32_VMX_CR8_LOAD_EXITING - guest TPR access
+ * IA32_VMX_CR8_STORE_EXITING - guest TPR access
+ * IA32_VMX_USE_TPR_SHADOW - guest TPR access (shadow)
+ *
+ * If we have EPT, we must be able to clear the following
+ * IA32_VMX_CR3_LOAD_EXITING - don't care about guest CR3 accesses
+ * IA32_VMX_CR3_STORE_EXITING - don't care about guest CR3 accesses
+ */
+ want1 = IA32_VMX_HLT_EXITING |
+ IA32_VMX_MWAIT_EXITING |
+ IA32_VMX_UNCONDITIONAL_IO_EXITING |
+ IA32_VMX_USE_MSR_BITMAPS |
+ IA32_VMX_CR8_LOAD_EXITING |
+ IA32_VMX_CR8_STORE_EXITING |
+ IA32_VMX_USE_TPR_SHADOW;
+ want0 = 0;
+
+ if (vmm_softc->mode == VMM_MODE_EPT) {
+ want1 |= IA32_VMX_ACTIVATE_SECONDARY_CONTROLS;
+ want0 |= IA32_VMX_CR3_LOAD_EXITING |
+ IA32_VMX_CR3_STORE_EXITING;
+ }
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_PROCBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_true_procbased_ctls;
+ } else {
+ ctrl = IA32_VMX_PROCBASED_CTLS;
+ ctrlval = vcpu->vc_vmx_procbased_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Secondary Procbased ctrls
+ *
+ * We want to be able to set the following, if available:
+ * IA32_VMX_ENABLE_VPID - use VPIDs where available
+ *
+ * If we have EPT, we must be able to set the following:
+ * IA32_VMX_ENABLE_EPT - enable EPT
+ *
+ * If we have unrestricted guest capability, we must be able to set
+ * the following:
+ * IA32_VMX_UNRESTRICTED_GUEST - enable unrestricted guest
+ */
+ want1 = 0;
+
+ /* XXX checking for 2ndary controls can be combined here */
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1))
+ want1 |= IA32_VMX_ENABLE_VPID;
+ }
+
+ if (vmm_softc->mode == VMM_MODE_EPT)
+ want1 |= IA32_VMX_ENABLE_EPT;
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_UNRESTRICTED_GUEST, 1)) {
+ want1 |= IA32_VMX_UNRESTRICTED_GUEST;
+ ug = 1;
+ }
+ }
+
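+	/*
+	 * There is no "true" variant of the secondary procbased controls
+	 * MSR, so that MSR is used directly; any bit not explicitly wanted
+	 * set is requested clear.
+	 */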
+ want0 = ~want1;
+ ctrlval = vcpu->vc_vmx_procbased2_ctls;
+ ctrl = IA32_VMX_PROCBASED2_CTLS;
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &procbased2)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_PROCBASED2_CTLS, procbased2)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Exit ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT - ack interrupt on exit
+ * XXX clear save_debug_ctrls on exit ?
+ */
+ want1 = IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT;
+ want0 = 0;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_EXIT_CTLS;
+ ctrlval = vcpu->vc_vmx_true_exit_ctls;
+ } else {
+ ctrl = IA32_VMX_EXIT_CTLS;
+ ctrlval = vcpu->vc_vmx_exit_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &exit)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_CTLS, exit)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Entry ctrls
+ *
+ * We must be able to set the following:
+ * IA32_VMX_IA32E_MODE_GUEST (if no unrestricted guest)
+ * We must be able to clear the following:
+ * IA32_VMX_ENTRY_TO_SMM - enter to SMM
+ * IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT
+ * IA32_VMX_LOAD_DEBUG_CONTROLS
+ * IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY
+ */
+ if (ug == 1)
+ want1 = 0;
+ else
+ want1 = IA32_VMX_IA32E_MODE_GUEST;
+
+ want0 = IA32_VMX_ENTRY_TO_SMM |
+ IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT |
+ IA32_VMX_LOAD_DEBUG_CONTROLS |
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ ctrl = IA32_VMX_TRUE_ENTRY_CTLS;
+ ctrlval = vcpu->vc_vmx_true_entry_ctls;
+ } else {
+ ctrl = IA32_VMX_ENTRY_CTLS;
+ ctrlval = vcpu->vc_vmx_entry_ctls;
+ }
+
+ if (vcpu_vmx_compute_ctrl(ctrlval, ctrl, want1, want0, &entry)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_CTLS, entry)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmm_softc->mode == VMM_MODE_EPT) {
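+		/*
+		 * The EPTP combines the physical address of the top-level
+		 * EPT table with the paging-structure memory type in bits
+		 * 2:0, the page-walk length minus one in bits 5:3 and, if
+		 * supported, the accessed/dirty-bit enable in bit 6.
+		 */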
+ eptp = vcpu->vc_parent->vm_map->pmap->pm_npt_pa;
+ msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
+ if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) {
+ /* Page walk length 4 supported */
+ eptp |= ((IA32_EPT_PAGE_WALK_LENGTH - 1) << 3);
+ }
+
+ if (msr & IA32_EPT_VPID_CAP_WB) {
+ /* WB cache type supported */
+ eptp |= IA32_EPT_PAGING_CACHE_TYPE_WB;
+ }
+
+ if (msr & IA32_EPT_VPID_CAP_AD_BITS) {
+ /* EPT A/D bits supported */
+ eptp |= IA32_EPT_AD_BITS_ENABLE;
+ }
+
+ DPRINTF("guest eptp = 0x%llx\n", eptp);
+		DPRINTF("write 0x%x to EPT_LO\n",
+		    (uint32_t)(eptp & 0xFFFFFFFFUL));
+		if (vmwrite(VMCS_GUEST_IA32_EPTP,
+		    (uint32_t)(eptp & 0xFFFFFFFFUL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_GUEST_IA32_EPTP_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1))
+ if (vmwrite(VMCS_GUEST_VPID,
+ (uint16_t)vcpu->vc_parent->vm_id)) {
+ ret = EINVAL;
+ goto exit;
+ }
+ }
+
+ /*
+ * Determine which bits in CR0 have to be set to a fixed
+ * value as per Intel SDM A.7.
+ * CR0 bits in the vrs parameter must match these.
+ */
+
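+	/*
+	 * A CR0 bit set in both FIXED0 and FIXED1 must be 1 while in VMX
+	 * operation, a bit clear in both must be 0, and bits that differ
+	 * between the two are flexible.
+	 */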
+ want1 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) &
+ (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+ want0 = ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0) &
+ ~(curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+
+ /*
+ * CR0_FIXED0 and CR0_FIXED1 may report the CR0_PG and CR0_PE bits as
+ * fixed to 1 even if the CPU supports the unrestricted guest
+ * feature. Update want1 and want0 accordingly to allow
+ * any value for CR0_PG and CR0_PE in vrs->vrs_crs[VCPU_REGS_CR0] if
+ * the CPU has the unrestricted guest capability.
+ */
+ cr0 = vrs->vrs_crs[VCPU_REGS_CR0];
+
+ if (ug) {
+ want1 &= ~(CR0_PG | CR0_PE);
+ want0 &= ~(CR0_PG | CR0_PE);
+ cr0 &= ~(CR0_PG | CR0_PE);
+ }
+
+ /*
+ * VMX may require some bits to be set that userland should not have
+ * to care about. Set those here.
+ */
+ if (want1 & CR0_NE)
+ cr0 |= CR0_NE;
+
+ if ((cr0 & want1) != want1) {
+ ret = EINVAL;
+ goto exit;
+ }
+ if ((~cr0 & want0) != want0) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (ug)
+ cr3 = 0;
+ else
+ cr3 = vrs->vrs_crs[VCPU_REGS_CR3];
+
+ /*
+ * Determine default CR4 as per Intel SDM A.8
+ * All flexible bits are set to 0
+ */
+ cr4 = (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0) &
+ (curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1);
+
+ /*
+ * If we are starting in restricted guest mode, enable PAE
+ */
+ if (ug == 0)
+ cr4 |= CR4_PAE;
+
+ vrs->vrs_crs[VCPU_REGS_CR0] = cr0;
+ vrs->vrs_crs[VCPU_REGS_CR3] = cr3;
+ vrs->vrs_crs[VCPU_REGS_CR4] = cr4;
+
+ /*
+ * Select MSRs to be loaded on exit
+ */
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_load_va;
+ msr_store[0].vms_index = MSR_EFER;
+ msr_store[0].vms_data = rdmsr(MSR_EFER);
+
+ /*
+ * Select MSRs to be loaded on entry / saved on exit
+ */
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va;
+
+ msr_store[0].vms_index = MSR_EFER;
+ msr_store[0].vms_data = 0ULL; /* Initial value */
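+
+	/*
+	 * Note: the VM-entry MSR-load list is pointed at this same area
+	 * below, so the EFER value stored here on each exit is loaded back
+	 * into the guest on the next entry.
+	 */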
+
+ /*
+	 * Currently we have the same count of entry/exit MSR loads/stores
+ * but this is not an architectural requirement.
+ */
+ if (vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, VMX_NUM_MSR_STORE)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_STORE_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_load_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_EXIT_LOAD_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS,
+ vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_ENTRY_LOAD_MSR_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_MSR_BITMAP_ADDRESS,
+ vcpu->vc_msr_bitmap_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_MSR_BITMAP_ADDRESS_HI, 0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /*
+ * Set up the VMCS for the register state we want during VCPU start.
+ * This matches what the CPU state would be after a bootloader
+ * transition to 'start'.
+ */
+ ret = vcpu_writeregs_vmx(vcpu, VM_RWREGS_ALL, 0, vrs);
+
+ /*
+ * Set up the MSR bitmap
+ */
+ memset((uint8_t *)vcpu->vc_msr_bitmap_va, 0xFF, PAGE_SIZE);
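+
+	/*
+	 * A set bit in the bitmap causes a VM exit when the guest accesses
+	 * the corresponding MSR; vmx_setmsrbrw() then whitelists the MSRs
+	 * the guest may access without exiting.
+	 */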
+ vmx_setmsrbrw(vcpu, MSR_IA32_FEATURE_CONTROL);
+ vmx_setmsrbrw(vcpu, MSR_MTRRcap);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_CS);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_ESP);
+ vmx_setmsrbrw(vcpu, MSR_SYSENTER_EIP);
+ vmx_setmsrbrw(vcpu, MSR_MTRRvarBase);
+ vmx_setmsrbrw(vcpu, MSR_CR_PAT);
+ vmx_setmsrbrw(vcpu, MSR_MTRRdefType);
+ vmx_setmsrbrw(vcpu, MSR_EFER);
+ vmx_setmsrbrw(vcpu, MSR_STAR);
+ vmx_setmsrbrw(vcpu, MSR_LSTAR);
+ vmx_setmsrbrw(vcpu, MSR_CSTAR);
+ vmx_setmsrbrw(vcpu, MSR_SFMASK);
+ vmx_setmsrbrw(vcpu, MSR_FSBASE);
+ vmx_setmsrbrw(vcpu, MSR_GSBASE);
+ vmx_setmsrbrw(vcpu, MSR_KERNELGSBASE);
+
+ /* XXX CR0 shadow */
+ /* XXX CR4 shadow */
+
+ /* Flush the VMCS */
+ if (vmclear(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+exit:
+ return (ret);
+}
+
+/*
+ * vcpu_init_vmx
+ *
+ * Intel VMX specific VCPU initialization routine.
+ *
+ * This function allocates various per-VCPU memory regions, sets up initial
+ * VCPU VMCS controls, and sets initial register values.
+ */
+int
+vcpu_init_vmx(struct vcpu *vcpu)
+{
+ struct vmcs *vmcs;
+ uint32_t cr0, cr4;
+ int ret;
+
+ ret = 0;
+
+ /* Allocate VMCS VA */
+ vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero,
+ &kd_waitok);
+
+ if (!vcpu->vc_control_va)
+ return (ENOMEM);
+
+ /* Compute VMCS PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_control_va,
+ (paddr_t *)&vcpu->vc_control_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR bitmap VA */
+ vcpu->vc_msr_bitmap_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero,
+ &kd_waitok);
+
+ if (!vcpu->vc_msr_bitmap_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR bitmap PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_msr_bitmap_va,
+ (paddr_t *)&vcpu->vc_msr_bitmap_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR exit load area VA */
+ vcpu->vc_vmx_msr_exit_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_exit_load_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR exit load area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_load_va,
+ &vcpu->vc_vmx_msr_exit_load_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR exit save area VA */
+ vcpu->vc_vmx_msr_exit_save_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_exit_save_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR exit save area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_exit_save_va,
+ &vcpu->vc_vmx_msr_exit_save_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Allocate MSR entry load area VA */
+ vcpu->vc_vmx_msr_entry_load_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
+ &kp_zero, &kd_waitok);
+
+ if (!vcpu->vc_vmx_msr_entry_load_va) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
+ /* Compute MSR entry load area PA */
+ if (!pmap_extract(pmap_kernel(), vcpu->vc_vmx_msr_entry_load_va,
+ &vcpu->vc_vmx_msr_entry_load_pa)) {
+ ret = ENOMEM;
+ goto exit;
+ }
+
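+	/*
+	 * The first 32 bits of the VMCS region must contain the VMCS
+	 * revision identifier reported by IA32_VMX_BASIC before the region
+	 * can be loaded with vmptrld.
+	 */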
+ vmcs = (struct vmcs *)vcpu->vc_control_va;
+ vmcs->vmcs_revision = curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision;
+
+ /*
+ * Load the VMCS onto this PCPU so we can write registers
+ */
+ if (vmptrld(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host CR0 */
+ cr0 = rcr0();
+ if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host CR4 */
+ cr4 = rcr4();
+ if (vmwrite(VMCS_HOST_IA32_CR4, cr4)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host Segment Selectors */
+ if (vmwrite(VMCS_HOST_IA32_CS_SEL, GSEL(GCODE_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_DS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_ES_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_FS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_GS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_SS_SEL, GSEL(GDATA_SEL, SEL_KPL))) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_HOST_IA32_TR_SEL, proc0.p_md.md_tss_sel)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* Host IDTR base */
+ if (vmwrite(VMCS_HOST_IA32_IDTR_BASE, (uint32_t)idt)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ /* VMCS link */
+ if (vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFF)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+ if (vmwrite(VMCS_LINK_POINTER_HI, 0xFFFFFFFF)) {
+ ret = EINVAL;
+ goto exit;
+ }
+
+exit:
+ if (ret) {
+ if (vcpu->vc_control_va)
+ km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_msr_bitmap_va)
+ km_free((void *)vcpu->vc_msr_bitmap_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_save_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_entry_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ }
+
+ return (ret);
+}
+
+/*
+ * vcpu_reset_regs
+ *
+ * Resets a vcpu's registers to the provided state
+ *
+ * Parameters:
+ * vcpu: the vcpu whose registers shall be reset
+ * vrs: the desired register state
+ *
+ * Return values:
+ * 0: the vcpu's registers were successfully reset
+ * !0: the vcpu's registers could not be reset (see arch-specific reset
+ * function for various values that can be returned here)
+ */
+int
+vcpu_reset_regs(struct vcpu *vcpu, struct vcpu_reg_state *vrs)
+{
+ int ret;
+
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ ret = vcpu_reset_regs_vmx(vcpu, vrs);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ ret = vcpu_reset_regs_svm(vcpu, vrs);
+ else
+ panic("unknown vmm mode\n");
+
+ return (ret);
+}
+
+/*
+ * vcpu_init_svm
+ *
+ * AMD SVM specific VCPU initialization routine.
+ */
+int
+vcpu_init_svm(struct vcpu *vcpu)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vcpu_init
+ *
+ * Calls the architecture-specific VCPU init routine
+ */
+int
+vcpu_init(struct vcpu *vcpu)
+{
+ int ret = 0;
+
+ vcpu->vc_hsa_stack_va = (vaddr_t)malloc(PAGE_SIZE, M_DEVBUF,
+ M_NOWAIT|M_ZERO);
+ if (!vcpu->vc_hsa_stack_va)
+ return (ENOMEM);
+
+ vcpu->vc_virt_mode = vmm_softc->mode;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ ret = vcpu_init_vmx(vcpu);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ ret = vcpu_init_svm(vcpu);
+ else
+ panic("unknown vmm mode\n");
+
+ if (ret)
+ free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+
+ return (ret);
+}
+
+/*
+ * vcpu_deinit_vmx
+ *
+ * Deinitializes the vcpu described by 'vcpu'
+ */
+void
+vcpu_deinit_vmx(struct vcpu *vcpu)
+{
+ if (vcpu->vc_control_va)
+ km_free((void *)vcpu->vc_control_va, PAGE_SIZE,
+ &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_save_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_save_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_exit_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_exit_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_vmx_msr_entry_load_va)
+ km_free((void *)vcpu->vc_vmx_msr_entry_load_va,
+ PAGE_SIZE, &kv_page, &kp_zero);
+ if (vcpu->vc_hsa_stack_va)
+ free((void *)vcpu->vc_hsa_stack_va, M_DEVBUF, PAGE_SIZE);
+}
+
+/*
+ * vcpu_deinit_svm
+ *
+ * Deinitializes the vcpu described by 'vcpu'
+ */
+void
+vcpu_deinit_svm(struct vcpu *vcpu)
+{
+ /* Unused */
+}
+
+/*
+ * vcpu_deinit
+ *
+ * Calls the architecture-specific VCPU deinit routine
+ */
+void
+vcpu_deinit(struct vcpu *vcpu)
+{
+ if (vmm_softc->mode == VMM_MODE_VMX ||
+ vmm_softc->mode == VMM_MODE_EPT)
+ vcpu_deinit_vmx(vcpu);
+ else if (vmm_softc->mode == VMM_MODE_SVM ||
+ vmm_softc->mode == VMM_MODE_RVI)
+ vcpu_deinit_svm(vcpu);
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vm_teardown
+ *
+ * Tears down (destroys) the vm indicated by 'vm'.
+ */
+void
+vm_teardown(struct vm *vm)
+{
+ struct vcpu *vcpu, *tmp;
+
+ /* Free VCPUs */
+ rw_enter_write(&vm->vm_vcpu_lock);
+ SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) {
+ SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
+ vcpu_deinit(vcpu);
+ pool_put(&vcpu_pool, vcpu);
+ }
+
+ vm_impl_deinit(vm);
+
+ /* teardown guest vmspace */
+ if (vm->vm_map != NULL)
+ uvm_map_deallocate(vm->vm_map);
+
+ vmm_softc->vm_ct--;
+ if (vmm_softc->vm_ct < 1)
+ vmm_stop();
+ rw_exit_write(&vm->vm_vcpu_lock);
+ pool_put(&vm_pool, vm);
+}
+
+/*
+ * vcpu_vmx_check_cap
+ *
+ * Checks if the 'cap' bit in the 'msr' MSR can be set or cleared (set = 1
+ * or set = 0, respectively).
+ *
+ * When considering 'msr', we check to see if true controls are available,
+ * and use those if so.
+ *
+ * Returns 1 if 'cap' can be set/cleared as requested, 0 otherwise.
+ */
+int
+vcpu_vmx_check_cap(struct vcpu *vcpu, uint32_t msr, uint32_t cap, int set)
+{
+ uint64_t ctl;
+
+ if (vcpu->vc_vmx_basic & IA32_VMX_TRUE_CTLS_AVAIL) {
+ switch (msr) {
+ case IA32_VMX_PINBASED_CTLS:
+ ctl = vcpu->vc_vmx_true_pinbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ ctl = vcpu->vc_vmx_true_procbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED2_CTLS:
+ ctl = vcpu->vc_vmx_procbased2_ctls;
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ ctl = vcpu->vc_vmx_true_entry_ctls;
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ ctl = vcpu->vc_vmx_true_exit_ctls;
+ break;
+ default:
+ return (0);
+ }
+ } else {
+ switch (msr) {
+ case IA32_VMX_PINBASED_CTLS:
+ ctl = vcpu->vc_vmx_pinbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ ctl = vcpu->vc_vmx_procbased_ctls;
+ break;
+ case IA32_VMX_PROCBASED2_CTLS:
+ ctl = vcpu->vc_vmx_procbased2_ctls;
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ ctl = vcpu->vc_vmx_entry_ctls;
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ ctl = vcpu->vc_vmx_exit_ctls;
+ break;
+ default:
+ return (0);
+ }
+ }
+
+ if (set) {
+ /* Check bit 'cap << 32', must be !0 */
+ return (ctl & ((uint64_t)cap << 32)) != 0;
+ } else {
+ /* Check bit 'cap', must be 0 */
+ return (ctl & cap) == 0;
+ }
+}
+
+/*
+ * vcpu_vmx_compute_ctrl
+ *
+ * Computes the appropriate control value, given the supplied parameters
+ * and CPU capabilities.
+ *
+ * Intel has made somewhat of a mess of this computation - it is described
+ * using no fewer than three different approaches, spread across many
+ * pages of the SDM. Further compounding the problem is the fact that now
+ * we have "true controls" for each type of "control", and each needs to
+ * be examined to get the calculation right, but only if "true" controls
+ * are present on the CPU we're on.
+ *
+ * Parameters:
+ * ctrlval: the control value, as read from the CPU MSR
+ * ctrl: which control is being set (eg, pinbased, procbased, etc)
+ * want0: the set of desired 0 bits
+ * want1: the set of desired 1 bits
+ * out: (out) the correct value to write into the VMCS for this VCPU,
+ * for the 'ctrl' desired.
+ *
+ * Returns 0 if successful, or EINVAL if the supplied parameters define
+ * an unworkable control setup.
+ */
+int
+vcpu_vmx_compute_ctrl(uint64_t ctrlval, uint16_t ctrl, uint32_t want1,
+ uint32_t want0, uint32_t *out)
+{
+ int i, set, clear;
+
+ /*
+ * The Intel SDM gives three formulae for determining which bits to
+ * set/clear for a given control and desired functionality. Formula
+ * 1 is the simplest but disallows use of newer features that are
+ * enabled by functionality in later CPUs.
+ *
+ * Formulas 2 and 3 allow such extra functionality. We use formula
+ * 2 - this requires us to know the identity of controls in the
+ * "default1" class for each control register, but allows us to not
+ * have to pass along and/or query both sets of capability MSRs for
+ * each control lookup. This makes the code slightly longer,
+ * however.
+ */
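+	/*
+	 * Worked example with an illustrative capability value of
+	 * 0x0000001600000016: bits 1, 2 and 4 are set in both halves, so
+	 * they can be set but not cleared and are forced to 1 below; every
+	 * other bit is clear in both halves, so it can be cleared but not
+	 * set and is forced to 0 (requesting the opposite yields EINVAL).
+	 */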
+ for (i = 0; i < 32; i++) {
+ /* Figure out if we can set and / or clear this bit */
+ set = (ctrlval & (1ULL << (i + 32))) != 0;
+ clear = ((1ULL << i) & ((uint64_t)ctrlval)) == 0;
+
+ /* If the bit can't be set nor cleared, something's wrong */
+ if (!set && !clear)
+ return (EINVAL);
+
+ /*
+ * Formula 2.c.i - "If the relevant VMX capability MSR
+ * reports that a control has a single setting, use that
+ * setting."
+ */
+ if (set && !clear) {
+ if (want0 & (1ULL << i))
+ return (EINVAL);
+ else
+ *out |= (1ULL << i);
+ } else if (clear && !set) {
+ if (want1 & (1ULL << i))
+ return (EINVAL);
+ else
+ *out &= ~(1ULL << i);
+ } else {
+ /*
+ * 2.c.ii - "If the relevant VMX capability MSR
+ * reports that a control can be set to 0 or 1
+ * and that control's meaning is known to the VMM,
+ * set the control based on the functionality desired."
+ */
+ if (want1 & (1ULL << i))
+ *out |= (1ULL << i);
+			else if (want0 & (1ULL << i))
+ *out &= ~(1ULL << i);
+ else {
+ /*
+ * ... assuming the control's meaning is not
+ * known to the VMM ...
+ *
+ * 2.c.iii - "If the relevant VMX capability
+ * MSR reports that a control can be set to 0
+ * or 1 and the control is not in the default1
+ * class, set the control to 0."
+ *
+ * 2.c.iv - "If the relevant VMX capability
+ * MSR reports that a control can be set to 0
+ * or 1 and the control is in the default1
+ * class, set the control to 1."
+ */
+ switch (ctrl) {
+ case IA32_VMX_PINBASED_CTLS:
+ case IA32_VMX_TRUE_PINBASED_CTLS:
+ /*
+ * A.3.1 - default1 class of pinbased
+ * controls comprises bits 1,2,4
+ */
+ switch (i) {
+ case 1:
+ case 2:
+ case 4:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ case IA32_VMX_PROCBASED_CTLS:
+ case IA32_VMX_TRUE_PROCBASED_CTLS:
+ /*
+ * A.3.2 - default1 class of procbased
+ * controls comprises bits 1, 4-6, 8,
+ * 13-16, 26
+ */
+ switch (i) {
+ case 1:
+ case 4 ... 6:
+ case 8:
+ case 13 ... 16:
+ case 26:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ /*
+ * Unknown secondary procbased controls
+ * can always be set to 0
+ */
+ case IA32_VMX_PROCBASED2_CTLS:
+ *out &= ~(1ULL << i);
+ break;
+ case IA32_VMX_EXIT_CTLS:
+ case IA32_VMX_TRUE_EXIT_CTLS:
+ /*
+ * A.4 - default1 class of exit
+ * controls comprises bits 0-8, 10,
+ * 11, 13, 14, 16, 17
+ */
+ switch (i) {
+ case 0 ... 8:
+ case 10 ... 11:
+ case 13 ... 14:
+ case 16 ... 17:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ case IA32_VMX_ENTRY_CTLS:
+ case IA32_VMX_TRUE_ENTRY_CTLS:
+ /*
+ * A.5 - default1 class of entry
+ * controls comprises bits 0-8, 12
+ */
+ switch (i) {
+ case 0 ... 8:
+ case 12:
+ *out |= (1ULL << i);
+ break;
+ default:
+ *out &= ~(1ULL << i);
+ break;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * vm_get_info
+ *
+ * Returns information about the VM indicated by 'vip'.
+ */
+int
+vm_get_info(struct vm_info_params *vip)
+{
+ struct vm_info_result *out;
+ struct vm *vm;
+ struct vcpu *vcpu;
+ int i, j;
+ size_t need;
+
+ rw_enter_read(&vmm_softc->vm_lock);
+ need = vmm_softc->vm_ct * sizeof(struct vm_info_result);
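+
+	/*
+	 * If the supplied buffer is too small, report the required size so
+	 * the caller can retry with a larger buffer.
+	 */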
+ if (vip->vip_size < need) {
+ vip->vip_info_ct = 0;
+ vip->vip_size = need;
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (0);
+ }
+
+ out = malloc(need, M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (out == NULL) {
+ vip->vip_info_ct = 0;
+ rw_exit_read(&vmm_softc->vm_lock);
+ return (ENOMEM);
+ }
+
+ i = 0;
+ vip->vip_info_ct = vmm_softc->vm_ct;
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ out[i].vir_memory_size = vm->vm_memory_size;
+ out[i].vir_used_size =
+ pmap_resident_count(vm->vm_map->pmap) * PAGE_SIZE;
+ out[i].vir_ncpus = vm->vm_vcpu_ct;
+ out[i].vir_id = vm->vm_id;
+ out[i].vir_creator_pid = vm->vm_creator_pid;
+ strncpy(out[i].vir_name, vm->vm_name, VMM_MAX_NAME_LEN);
+ rw_enter_read(&vm->vm_vcpu_lock);
+ for (j = 0; j < vm->vm_vcpu_ct; j++) {
+ out[i].vir_vcpu_state[j] = VCPU_STATE_UNKNOWN;
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list,
+ vc_vcpu_link) {
+ if (vcpu->vc_id == j)
+ out[i].vir_vcpu_state[j] =
+ vcpu->vc_state;
+ }
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ i++;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+ if (copyout(out, vip->vip_info, need) == EFAULT) {
+ free(out, M_DEVBUF, need);
+ return (EFAULT);
+ }
+
+ free(out, M_DEVBUF, need);
+ return (0);
+}
+
+/*
+ * vm_terminate
+ *
+ * Terminates the VM indicated by 'vtp'.
+ */
+int
+vm_terminate(struct vm_terminate_params *vtp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ u_int old, next;
+
+ /*
+ * Find desired VM
+ */
+ rw_enter_read(&vmm_softc->vm_lock);
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vtp->vtp_vm_id)
+ break;
+ }
+
+ if (vm != NULL) {
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ do {
+ old = vcpu->vc_state;
+ if (old == VCPU_STATE_RUNNING)
+ next = VCPU_STATE_REQTERM;
+ else if (old == VCPU_STATE_STOPPED)
+ next = VCPU_STATE_TERMINATED;
+ else /* must be REQTERM or TERMINATED */
+ break;
+ } while (old != atomic_cas_uint(&vcpu->vc_state,
+ old, next));
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vm == NULL)
+ return (ENOENT);
+
+ /* XXX possible race here two threads terminating the same vm? */
+ rw_enter_write(&vmm_softc->vm_lock);
+ SLIST_REMOVE(&vmm_softc->vm_list, vm, vm, vm_link);
+ rw_exit_write(&vmm_softc->vm_lock);
+ if (vm->vm_vcpus_running == 0)
+ vm_teardown(vm);
+
+ return (0);
+}
+
+/*
+ * vm_run
+ *
+ * Run the vm / vcpu specified by 'vrp'
+ */
+int
+vm_run(struct vm_run_params *vrp)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ int ret = 0;
+ u_int old, next;
+
+ /*
+ * Find desired VM
+ */
+ rw_enter_read(&vmm_softc->vm_lock);
+
+ SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+ if (vm->vm_id == vrp->vrp_vm_id)
+ break;
+ }
+
+ /*
+ * Attempt to locate the requested VCPU. If found, attempt to
+	 * transition from VCPU_STATE_STOPPED -> VCPU_STATE_RUNNING.
+ * Failure to make the transition indicates the VCPU is busy.
+ */
+ if (vm != NULL) {
+ rw_enter_read(&vm->vm_vcpu_lock);
+ SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+ if (vcpu->vc_id == vrp->vrp_vcpu_id)
+ break;
+ }
+
+ if (vcpu != NULL) {
+ old = VCPU_STATE_STOPPED;
+ next = VCPU_STATE_RUNNING;
+
+ if (atomic_cas_uint(&vcpu->vc_state, old, next) != old)
+ ret = EBUSY;
+ else
+ atomic_inc_int(&vm->vm_vcpus_running);
+ }
+ rw_exit_read(&vm->vm_vcpu_lock);
+
+ if (vcpu == NULL)
+ ret = ENOENT;
+ }
+ rw_exit_read(&vmm_softc->vm_lock);
+
+ if (vm == NULL)
+ ret = ENOENT;
+
+ /* Bail if errors detected in the previous steps */
+ if (ret)
+ return (ret);
+
+ /*
+	 * We may be returning from userland (vmd) after it helped handle the
+	 * last exit. If so (vrp_continue == 1), copy in the exit data. The
+ * exit data will be consumed before the next entry (this typically
+ * comprises VCPU register changes as the result of vmd(8)'s actions).
+ */
+ if (vrp->vrp_continue) {
+ if (copyin(vrp->vrp_exit, &vcpu->vc_exit,
+ sizeof(union vm_exit)) == EFAULT) {
+ return (EFAULT);
+ }
+ }
+
+ /* Run the VCPU specified in vrp */
+ if (vcpu->vc_virt_mode == VMM_MODE_VMX ||
+ vcpu->vc_virt_mode == VMM_MODE_EPT) {
+ ret = vcpu_run_vmx(vcpu, vrp);
+ } else if (vcpu->vc_virt_mode == VMM_MODE_SVM ||
+ vcpu->vc_virt_mode == VMM_MODE_RVI) {
+ ret = vcpu_run_svm(vcpu, vrp);
+ }
+
+ /*
+ * We can set the VCPU states here without CAS because once
+ * a VCPU is in state RUNNING or REQTERM, only the VCPU itself
+ * can switch the state.
+ */
+ atomic_dec_int(&vm->vm_vcpus_running);
+ if (vcpu->vc_state == VCPU_STATE_REQTERM) {
+ vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
+ vcpu->vc_state = VCPU_STATE_TERMINATED;
+ if (vm->vm_vcpus_running == 0)
+ vm_teardown(vm);
+ ret = 0;
+ } else if (ret == EAGAIN) {
+ /* If we are exiting, populate exit data so vmd can help. */
+ vrp->vrp_exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ vrp->vrp_irqready = vcpu->vc_irqready;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+
+ if (copyout(&vcpu->vc_exit, vrp->vrp_exit,
+ sizeof(union vm_exit)) == EFAULT) {
+ ret = EFAULT;
+ } else
+ ret = 0;
+ } else if (ret == 0) {
+ vrp->vrp_exit_reason = VM_EXIT_NONE;
+ vcpu->vc_state = VCPU_STATE_STOPPED;
+ } else {
+ vrp->vrp_exit_reason = VM_EXIT_TERMINATED;
+ vcpu->vc_state = VCPU_STATE_TERMINATED;
+ }
+
+ return (ret);
+}
+
+/*
+ * vcpu_must_stop
+ *
+ * Check if we need to (temporarily) stop running the VCPU for some reason,
+ * such as:
+ * - the VM was requested to terminate
+ * - the proc running this VCPU has pending signals
+ */
+int
+vcpu_must_stop(struct vcpu *vcpu)
+{
+ struct proc *p = curproc;
+
+ if (vcpu->vc_state == VCPU_STATE_REQTERM)
+ return (1);
+ if (CURSIG(p) != 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * vcpu_run_vmx
+ *
+ * VMM main loop used to run a VCPU.
+ *
+ * Parameters:
+ * vcpu: The VCPU to run
+ * vrp: run parameters
+ *
+ * Return values:
+ * 0: The run loop exited and no help is needed from vmd
+ * EAGAIN: The run loop exited and help from vmd is needed
+ *  EINVAL: an error occurred
+ */
+int
+vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *vrp)
+{
+ int ret = 0, resume, locked, exitinfo;
+ struct region_descriptor gdt;
+ struct cpu_info *ci;
+ uint64_t cr3, vmcs_ptr;
+ uint32_t insn_error, exit_reason;
+ struct schedstate_percpu *spc;
+ struct vmx_invvpid_descriptor vid;
+ uint32_t eii;
+ uint32_t procbased;
+ uint16_t irq;
+
+ resume = 0;
+ irq = vrp->vrp_irq;
+
+ /*
+ * If we are returning from userspace (vmd) because we exited
+ * last time, fix up any needed vcpu state first. Which state
+ * needs to be fixed up depends on what vmd populated in the
+ * exit data structure.
+ */
+ if (vrp->vrp_continue) {
+ switch (vcpu->vc_gueststate.vg_exit_reason) {
+ case VMX_EXIT_IO:
+ vcpu->vc_gueststate.vg_eax =
+ vcpu->vc_exit.vei.vei_data;
+ break;
+ case VMX_EXIT_HLT:
+ break;
+ case VMX_EXIT_INT_WINDOW:
+ break;
+ case VMX_EXIT_EXTINT:
+ break;
+ case VMX_EXIT_EPT_VIOLATION:
+ break;
+#ifdef VMM_DEBUG
+ case VMX_EXIT_TRIPLE_FAULT:
+ DPRINTF("%s: vm %d vcpu %d triple fault\n",
+ __func__, vcpu->vc_parent->vm_id,
+ vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ vmx_dump_vmcs(vcpu);
+ break;
+ case VMX_EXIT_ENTRY_FAILED_GUEST_STATE:
+ DPRINTF("%s: vm %d vcpu %d failed entry "
+ "due to invalid guest state\n",
+ __func__, vcpu->vc_parent->vm_id,
+ vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ return EINVAL;
+ default:
+ DPRINTF("%s: unimplemented exit type %d (%s)\n",
+ __func__,
+ vcpu->vc_gueststate.vg_exit_reason,
+ vmx_exit_reason_decode(
+ vcpu->vc_gueststate.vg_exit_reason));
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ break;
+#endif /* VMM_DEBUG */
+ }
+ }
+
+ while (ret == 0) {
+ if (!resume) {
+ /*
+ * We are launching for the first time, or we are
+ * resuming from a different pcpu, so we need to
+ * reset certain pcpu-specific values.
+ */
+ ci = curcpu();
+			setregion(&gdt, ci->ci_gdt,
+			    NGDT * sizeof(union descriptor) - 1);
+
+ vcpu->vc_last_pcpu = ci;
+
+ if (vmptrld(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ break;
+ }
+
+ if (gdt.rd_base == 0) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host GDTR base */
+ if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host TR base */
+ if (vmwrite(VMCS_HOST_IA32_TR_BASE,
+ proc0.p_md.md_tss_sel)) {
+ ret = EINVAL;
+ break;
+ }
+
+ /* Host CR3 */
+ cr3 = rcr3();
+ if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ /* Handle vmd(8) injected interrupts */
+ /* XXX - 0x20 should be changed to PIC's vector base */
+
+ /* Is there an interrupt pending injection? */
+ if (irq != 0xFFFF) {
+ if (!vcpu->vc_irqready) {
+ printf("vcpu_run_vmx: error - irq injected"
+ " while not ready\n");
+ ret = EINVAL;
+ break;
+ }
+
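+			/*
+			 * The entry interruption-information field takes the
+			 * vector in bits 7:0, the interruption type in bits
+			 * 10:8 (0 = external interrupt) and the valid flag
+			 * in bit 31.
+			 */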
+ eii = (irq & 0xFF) + 0x20;
+ eii |= (1ULL << 31); /* Valid */
+ eii |= (0ULL << 8); /* Hardware Interrupt */
+ if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) {
+ printf("vcpu_run_vmx: can't vector "
+ "interrupt to guest\n");
+ ret = EINVAL;
+ break;
+ }
+
+ irq = 0xFFFF;
+ } else if (!vcpu->vc_intr) {
+ /*
+ * Disable window exiting
+ */
+ if (vmread(VMCS_PROCBASED_CTLS, &procbased)) {
+				printf("vcpu_run_vmx: can't read "
+ "procbased ctls on exit\n");
+ ret = EINVAL;
+ break;
+ } else {
+ procbased &= ~IA32_VMX_INTERRUPT_WINDOW_EXITING;
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ printf("vcpu_run_vmx: can't write"
+ " procbased ctls on exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+ }
+
+ /* Invalidate old TLB mappings */
+ vid.vid_vpid = (uint64_t)vcpu->vc_parent->vm_id;
+ vid.vid_addr = 0ULL;
+ invvpid(IA32_VMX_INVVPID_SINGLE_CTX_GLB, &vid);
+
+ /* Start / resume the VCPU */
+ KERNEL_ASSERT_LOCKED();
+ KERNEL_UNLOCK();
+ ret = vmx_enter_guest(&vcpu->vc_control_pa,
+ &vcpu->vc_gueststate, resume, gdt.rd_base);
+
+ /* XXX */
+ tlbflushg();
+
+ exit_reason = VM_EXIT_NONE;
+ if (ret == 0) {
+ /*
+ * ret == 0 implies we entered the guest, and later
+ * exited for some valid reason
+ */
+ exitinfo = vmx_get_exit_info(
+ &vcpu->vc_gueststate.vg_eip, &exit_reason);
+ if (vmread(VMCS_GUEST_IA32_RFLAGS,
+ &vcpu->vc_gueststate.vg_eflags)) {
+ printf("vcpu_run_vmx: can't read guest rflags"
+ " during exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ if (ret || exitinfo != VMX_EXIT_INFO_COMPLETE ||
+ exit_reason != VMX_EXIT_EXTINT) {
+ KERNEL_LOCK();
+ locked = 1;
+ } else
+ locked = 0;
+
+ /* If we exited successfully ... */
+ if (ret == 0) {
+ resume = 1;
+ if (!(exitinfo & VMX_EXIT_INFO_HAVE_RIP)) {
+ printf("vcpu_run_vmx: cannot read guest rip\n");
+ ret = EINVAL;
+ break;
+ }
+
+ if (!(exitinfo & VMX_EXIT_INFO_HAVE_REASON)) {
+				printf("vcpu_run_vmx: can't read exit reason\n");
+ ret = EINVAL;
+ break;
+ }
+
+ /*
+ * Handle the exit. This will alter "ret" to EAGAIN if
+ * the exit handler determines help from vmd is needed.
+ */
+ vcpu->vc_gueststate.vg_exit_reason = exit_reason;
+ ret = vmx_handle_exit(vcpu);
+
+ /*
+ * When the guest exited due to an external interrupt,
+ * we do not yet hold the kernel lock: we need to
+ * handle interrupts first before grabbing the lock:
+ * the interrupt handler might do work that
+ * another CPU holding the kernel lock waits for.
+ *
+ * Example: the TLB shootdown code in the pmap module
+ * sends an IPI to all other CPUs and busy-waits for
+ * them to decrement tlb_shoot_wait to zero. While
+ * busy-waiting, the kernel lock is held.
+ *
+ * If this code here attempted to grab the kernel lock
+ * before handling the interrupt, it would block
+ * forever.
+ */
+ if (!locked)
+ KERNEL_LOCK();
+
+ if (vcpu->vc_gueststate.vg_eflags & PSL_I)
+ vcpu->vc_irqready = 1;
+ else
+ vcpu->vc_irqready = 0;
+
+ /*
+ * If not ready for interrupts, but interrupts pending,
+ * enable interrupt window exiting.
+ */
+ if (vcpu->vc_irqready == 0 && vcpu->vc_intr) {
+ if (vmread(VMCS_PROCBASED_CTLS, &procbased)) {
+ printf("vcpu_run_vmx: can't read"
+ " procbased ctls on intwin exit\n");
+ ret = EINVAL;
+ break;
+ }
+
+ procbased |= IA32_VMX_INTERRUPT_WINDOW_EXITING;
+ if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
+ printf("vcpu_run_vmx: can't write"
+ " procbased ctls on intwin exit\n");
+ ret = EINVAL;
+ break;
+ }
+ }
+
+ /*
+ * Exit to vmd if we are terminating, failed to enter,
+ * or need help (device I/O)
+ */
+ if (ret || vcpu_must_stop(vcpu))
+ break;
+
+ if (vcpu->vc_intr && vcpu->vc_irqready) {
+ ret = EAGAIN;
+ break;
+ }
+
+ /* Check if we should yield - don't hog the cpu */
+ spc = &ci->ci_schedstate;
+ if (spc->spc_schedflags & SPCF_SHOULDYIELD) {
+ resume = 0;
+ if (vmclear(&vcpu->vc_control_pa)) {
+ ret = EINVAL;
+ break;
+ }
+ yield();
+ }
+ } else if (ret == VMX_FAIL_LAUNCH_INVALID_VMCS) {
+ printf("vcpu_run_vmx: failed launch with invalid "
+ "vmcs\n");
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ } else if (ret == VMX_FAIL_LAUNCH_VALID_VMCS) {
+ exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ printf("vcpu_run_vmx: failed launch with valid "
+ "vmcs, code=%d (%s)\n", exit_reason,
+ vmx_instruction_error_decode(exit_reason));
+ if (vmread(VMCS_INSTRUCTION_ERROR, &insn_error)) {
+ printf("vcpu_run_vmx: can't read"
+ " insn error field\n");
+ } else
+ printf("vcpu_run_vmx: insn error code = "
+ "%d\n", insn_error);
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ } else {
+ printf("vcpu_run_vmx: failed launch for unknown "
+ "reason %d\n", ret);
+#ifdef VMM_DEBUG
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EINVAL;
+ }
+ }
+
+ /*
+ * We are heading back to userspace (vmd), either because we need help
+ * handling an exit, a guest interrupt is pending, or we failed in some
+ * way to enter the guest. Clear any current VMCS pointer as we may end
+ * up coming back on a different CPU.
+ */
+ if (!vmptrst(&vmcs_ptr)) {
+ if (vmcs_ptr != 0xFFFFFFFFFFFFFFFFULL)
+ if (vmclear(&vcpu->vc_control_pa))
+ ret = EINVAL;
+ } else
+ ret = EINVAL;
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_intr
+ *
+ * Handle host (external) interrupts. We read which interrupt fired by
+ * extracting the vector from the VMCS and dispatch the interrupt directly
+ * to the host using vmm_dispatch_intr.
+ */
+void
+vmx_handle_intr(struct vcpu *vcpu)
+{
+ uint8_t vec;
+ uint32_t eii;
+ struct gate_descriptor *idte;
+ vaddr_t handler;
+
+ if (vmread(VMCS_EXIT_INTERRUPTION_INFO, &eii)) {
+ printf("vmx_handle_intr: can't obtain intr info\n");
+ return;
+ }
+
+ vec = eii & 0xFF;
+
+ /* XXX check "error valid" code in eii, abort if 0 */
+	idte = &idt[vec];
+ handler = idte->gd_looffset + ((uint64_t)idte->gd_hioffset << 16);
+ vmm_dispatch_intr(handler);
+}
+
+/*
+ * vmx_handle_hlt
+ *
+ * Handle HLT exits
+ */
+int
+vmx_handle_hlt(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_hlt: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+ return (EAGAIN);
+}
+
+/*
+ * vmx_get_exit_info
+ *
+ * Returns exit information containing the current guest EIP and exit reason
+ * in 'eip' and 'exit_reason'. The return value is a bitmask indicating
+ * whether reading the EIP and exit reason was successful.
+ */
+int
+vmx_get_exit_info(uint32_t *eip, uint32_t *exit_reason)
+{
+ int rv = 0;
+
+ if (vmread(VMCS_GUEST_IA32_RIP, eip) == 0) {
+ rv |= VMX_EXIT_INFO_HAVE_RIP;
+ if (vmread(VMCS_EXIT_REASON, exit_reason) == 0)
+ rv |= VMX_EXIT_INFO_HAVE_REASON;
+ }
+ return (rv);
+}
+
+/*
+ * vmx_handle_exit
+ *
+ * Handle exits from the VM by decoding the exit reason and calling various
+ * subhandlers as needed.
+ */
+int
+vmx_handle_exit(struct vcpu *vcpu)
+{
+ uint64_t exit_reason;
+ uint32_t eflags;
+ int update_rip, ret = 0;
+
+ update_rip = 0;
+ exit_reason = vcpu->vc_gueststate.vg_exit_reason;
+ eflags = vcpu->vc_gueststate.vg_eflags;
+
+ switch (exit_reason) {
+ case VMX_EXIT_INT_WINDOW:
+ if (!(eflags & PSL_I)) {
+ DPRINTF("vmx_handle_exit: impossible interrupt window"
+ " exit config\n");
+ ret = EINVAL;
+ break;
+ }
+
+ ret = EAGAIN;
+ update_rip = 0;
+ break;
+ case VMX_EXIT_EPT_VIOLATION:
+ ret = vmx_handle_np_fault(vcpu);
+ break;
+ case VMX_EXIT_CPUID:
+ ret = vmx_handle_cpuid(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_IO:
+ ret = vmx_handle_inout(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_EXTINT:
+ vmx_handle_intr(vcpu);
+ update_rip = 0;
+ break;
+ case VMX_EXIT_CR_ACCESS:
+ ret = vmx_handle_cr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_HLT:
+ ret = vmx_handle_hlt(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_RDMSR:
+ ret = vmx_handle_rdmsr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_WRMSR:
+ ret = vmx_handle_wrmsr(vcpu);
+ update_rip = 1;
+ break;
+ case VMX_EXIT_TRIPLE_FAULT:
+#ifdef VMM_DEBUG
+ DPRINTF("vmx_handle_exit: vm %d vcpu %d triple fault\n",
+ vcpu->vc_parent->vm_id, vcpu->vc_id);
+ vmx_vcpu_dump_regs(vcpu);
+ dump_vcpu(vcpu);
+ vmx_dump_vmcs(vcpu);
+#endif /* VMM_DEBUG */
+ ret = EAGAIN;
+ update_rip = 0;
+ break;
+ default:
+ DPRINTF("vmx_handle_exit: unhandled exit %lld (%s)\n",
+ exit_reason, vmx_exit_reason_decode(exit_reason));
+ return (EINVAL);
+ }
+
+ if (update_rip) {
+ if (vmwrite(VMCS_GUEST_IA32_RIP,
+ vcpu->vc_gueststate.vg_eip)) {
+ printf("vmx_handle_exit: can't advance rip\n");
+ return (EINVAL);
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * vmm_get_guest_memtype
+ *
+ * Returns the type of memory 'gpa' refers to in the context of vm 'vm'
+ */
+int
+vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
+{
+ int i;
+ struct vm_mem_range *vmr;
+
+ if (gpa >= VMM_PCI_MMIO_BAR_BASE && gpa <= VMM_PCI_MMIO_BAR_END) {
+ DPRINTF("guest mmio access @ 0x%llx\n", (uint64_t)gpa);
+ return (VMM_MEM_TYPE_REGULAR);
+ }
+
+ /* XXX Use binary search? */
+ for (i = 0; i < vm->vm_nmemranges; i++) {
+ vmr = &vm->vm_memranges[i];
+
+		/*
+		 * vm_memranges are sorted in ascending order, so once gpa is
+		 * below the start of this range it cannot lie in any later
+		 * range either.
+		 */
+ if (gpa < vmr->vmr_gpa)
+ break;
+
+ if (gpa < vmr->vmr_gpa + vmr->vmr_size)
+ return (VMM_MEM_TYPE_REGULAR);
+ }
+
+ DPRINTF("guest memtype @ 0x%llx unknown\n", (uint64_t)gpa);
+ return (VMM_MEM_TYPE_UNKNOWN);
+}
+
+/*
+ * vmm_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU. Calls the appropriate architecture-specific subroutine.
+ */
+int
+vmm_get_guest_faulttype(void)
+{
+ if (vmm_softc->mode == VMM_MODE_EPT)
+ return vmx_get_guest_faulttype();
+ else if (vmm_softc->mode == VMM_MODE_RVI)
+		return svm_get_guest_faulttype();
+ else
+ panic("unknown vmm mode\n");
+}
+
+/*
+ * vmx_get_exit_qualification
+ *
+ * Return the current VMCS' exit qualification information
+ */
+int
+vmx_get_exit_qualification(uint32_t *exit_qualification)
+{
+ if (vmread(VMCS_GUEST_EXIT_QUALIFICATION, exit_qualification)) {
+		printf("vmx_get_exit_qualification: can't extract exit qual\n");
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * vmx_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU.
+ */
+int
+vmx_get_guest_faulttype(void)
+{
+ uint32_t exit_qualification;
+ uint64_t presentmask = IA32_VMX_EPT_FAULT_WAS_READABLE |
+ IA32_VMX_EPT_FAULT_WAS_WRITABLE | IA32_VMX_EPT_FAULT_WAS_EXECABLE;
+ uint64_t protmask = IA32_VMX_EPT_FAULT_READ |
+ IA32_VMX_EPT_FAULT_WRITE | IA32_VMX_EPT_FAULT_EXEC;
+
+ if (vmx_get_exit_qualification(&exit_qualification))
+ return (-1);
+
+ if ((exit_qualification & presentmask) == 0)
+ return VM_FAULT_INVALID;
+ if (exit_qualification & protmask)
+ return VM_FAULT_PROTECT;
+ return (-1);
+}
+
+/*
+ * svm_get_guest_faulttype
+ *
+ * Determines the type (R/W/X) of the last fault on the VCPU last run on
+ * this PCPU.
+ */
+int
+svm_get_guest_faulttype(void)
+{
+ /* XXX removed due to rot */
+ return (-1);
+}
+
+/*
+ * vmx_fault_page
+ *
+ * Request a new page to be faulted into the UVM map of the VM owning 'vcpu'
+ * at address 'gpa'.
+ */
+int
+vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
+{
+ int fault_type, ret;
+
+ fault_type = vmx_get_guest_faulttype();
+ if (fault_type == -1) {
+ printf("vmx_fault_page: invalid fault type\n");
+ return (EINVAL);
+ }
+
+ ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, fault_type,
+ PROT_READ | PROT_WRITE | PROT_EXEC);
+ if (ret)
+ printf("vmx_fault_page: uvm_fault returns %d\n", ret);
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_np_fault
+ *
+ * High level nested paging handler for VMX. Verifies that a fault is for a
+ * valid memory region, then faults a page, or aborts otherwise.
+ */
+int
+vmx_handle_np_fault(struct vcpu *vcpu)
+{
+ uint64_t gpa;
+ uint32_t gpa_lo, gpa_hi;
+ int gpa_memtype, ret;
+
+ ret = 0;
+ if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa_lo)) {
+		printf("vmx_handle_np_fault: cannot extract faulting pa lo\n");
+ return (EINVAL);
+ }
+
+ if (vmread(VMCS_GUEST_PHYSICAL_ADDRESS_HI, &gpa_hi)) {
+		printf("vmx_handle_np_fault: cannot extract faulting pa hi\n");
+ return (EINVAL);
+ }
+
+ gpa = (uint64_t)gpa_lo | (uint64_t)gpa_hi << 32ULL;
+
+ gpa_memtype = vmm_get_guest_memtype(vcpu->vc_parent, gpa);
+ switch (gpa_memtype) {
+ case VMM_MEM_TYPE_REGULAR:
+ ret = vmx_fault_page(vcpu, gpa);
+ break;
+ default:
+ printf("unknown memory type %d for GPA 0x%llx\n",
+ gpa_memtype, gpa);
+ return (EINVAL);
+ }
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_inout
+ *
+ * Exit handler for IN/OUT instructions.
+ *
+ * The vmm can handle certain IN/OUTS without exiting to vmd, but most of these
+ * will be passed to vmd for completion.
+ */
+int
+vmx_handle_inout(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t exit_qual;
+ int ret;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_inout: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ if (vmx_get_exit_qualification(&exit_qual)) {
+ printf("vmx_handle_inout: can't get exit qual\n");
+ return (EINVAL);
+ }
+
+ /* Bits 0:2 - size of exit */
+ vcpu->vc_exit.vei.vei_size = (exit_qual & 0x7) + 1;
+ /* Bit 3 - direction */
+ vcpu->vc_exit.vei.vei_dir = (exit_qual & 0x8) >> 3;
+ /* Bit 4 - string instruction? */
+ vcpu->vc_exit.vei.vei_string = (exit_qual & 0x10) >> 4;
+ /* Bit 5 - REP prefix? */
+ vcpu->vc_exit.vei.vei_rep = (exit_qual & 0x20) >> 5;
+ /* Bit 6 - Operand encoding */
+ vcpu->vc_exit.vei.vei_encoding = (exit_qual & 0x40) >> 6;
+ /* Bit 16:31 - port */
+ vcpu->vc_exit.vei.vei_port = (exit_qual & 0xFFFF0000) >> 16;
+ /* Data */
+ vcpu->vc_exit.vei.vei_data = vcpu->vc_gueststate.vg_eax;
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ /*
+ * The following ports usually belong to devices owned by vmd.
+ * Return EAGAIN to signal help needed from userspace (vmd).
+ * Return 0 to indicate we don't care about this port.
+ *
+ * XXX something better than a hardcoded list here, maybe
+ * configure via vmd via the device list in vm create params?
+ *
+ * XXX handle not eax target
+ */
+ switch (vcpu->vc_exit.vei.vei_port) {
+ case IO_ICU1 ... IO_ICU1 + 1:
+ case 0x40 ... 0x43:
+ case IO_RTC ... IO_RTC + 1:
+ case IO_ICU2 ... IO_ICU2 + 1:
+ case 0x3f8 ... 0x3ff:
+ case 0xcf8:
+ case 0xcfc:
+ case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+ ret = EAGAIN;
+ break;
+ default:
+		/* Reads from unsupported ports return FFs */
+ if (vcpu->vc_exit.vei.vei_dir == 1)
+ vcpu->vc_gueststate.vg_eax = 0xFFFFFFFF;
+ ret = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * vmx_handle_cr
+ *
+ * Handle reads/writes to control registers (except CR3)
+ */
+int
+vmx_handle_cr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t exit_qual;
+ uint8_t crnum, dir;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_cr: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ if (vmx_get_exit_qualification(&exit_qual)) {
+ printf("vmx_handle_cr: can't get exit qual\n");
+ return (EINVAL);
+ }
+
+ /* Low 4 bits of exit_qual represent the CR number */
+ crnum = exit_qual & 0xf;
+
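+	/*
+	 * Bits 5:4 of the exit qualification encode the access type:
+	 * 0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW.
+	 */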
+ dir = (exit_qual & 0x30) >> 4;
+
+ switch (dir) {
+ case CR_WRITE:
+ DPRINTF("vmx_handle_cr: mov to cr%d @ %x\n",
+ crnum, vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_READ:
+ DPRINTF("vmx_handle_cr: mov from cr%d @ %x\n",
+ crnum, vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_CLTS:
+ DPRINTF("vmx_handle_cr: clts instruction @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ break;
+ case CR_LMSW:
+ DPRINTF("vmx_handle_cr: lmsw instruction @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ break;
+ default:
+ DPRINTF("vmx_handle_cr: unknown cr access @ %x\n",
+ vcpu->vc_gueststate.vg_eip);
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_rdmsr
+ *
+ * Handler for rdmsr instructions. Bitmap MSRs are allowed implicit access
+ * and won't end up here. This handler is primarily intended to catch otherwise
+ * unknown MSR access for possible later inclusion in the bitmap list. For
+ * each MSR access that ends up here, we log the access.
+ *
+ * Parameters:
+ * vcpu: vcpu structure containing instruction info causing the exit
+ *
+ * Return value:
+ * 0: The operation was successful
+ *  EINVAL: An error occurred
+ */
+int
+vmx_handle_rdmsr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint64_t msr;
+ uint32_t *eax, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("%s: can't obtain instruction length\n", __func__);
+ return (EINVAL);
+ }
+
+ /* All RDMSR instructions are 0x0F 0x32 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ msr = rdmsr(*ecx);
+ *eax = msr & 0xFFFFFFFFULL;
+ *edx = msr >> 32;
+
+ /* XXX log the access for now, to be able to identify unknown MSRs */
+ printf("%s: rdmsr exit, msr=0x%x, data returned to "
+ "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax);
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_wrmsr
+ *
+ * Handler for wrmsr instructions. This handler logs the access, and discards
+ * the written data. Any valid wrmsr will not end up here (it will be
+ * whitelisted in the MSR bitmap).
+ *
+ * Parameters:
+ * vcpu: vcpu structure containing instruction info causing the exit
+ *
+ * Return value:
+ * 0: The operation was successful
+ *  EINVAL: An error occurred
+ */
+int
+vmx_handle_wrmsr(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t *eax, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("%s: can't obtain instruction length\n", __func__);
+ return (EINVAL);
+ }
+
+ /* All WRMSR instructions are 0x0F 0x30 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ /* XXX log the access for now, to be able to identify unknown MSRs */
+ printf("%s: wrmsr exit, msr=0x%x, discarding data written from "
+ "guest=0x%x:0x%x\n", __func__, *ecx, *edx, *eax);
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vmx_handle_cpuid
+ *
+ * Exit handler for CPUID instruction
+ */
+int
+vmx_handle_cpuid(struct vcpu *vcpu)
+{
+ uint32_t insn_length;
+ uint32_t *eax, *ebx, *ecx, *edx;
+
+ if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+ printf("vmx_handle_cpuid: can't obtain instruction length\n");
+ return (EINVAL);
+ }
+
+ /* All CPUID instructions are 0x0F 0xA2 */
+ KASSERT(insn_length == 2);
+
+ eax = &vcpu->vc_gueststate.vg_eax;
+ ebx = &vcpu->vc_gueststate.vg_ebx;
+ ecx = &vcpu->vc_gueststate.vg_ecx;
+ edx = &vcpu->vc_gueststate.vg_edx;
+
+ switch (*eax) {
+ case 0x00: /* Max level and vendor ID */
+ *eax = 0x07; /* cpuid_level */
+ *ebx = *((uint32_t *)&cpu_vendor);
+ *edx = *((uint32_t *)&cpu_vendor + 1);
+ *ecx = *((uint32_t *)&cpu_vendor + 2);
+ break;
+ case 0x01: /* Version, brand, feature info */
+ *eax = cpu_id;
+ /* mask off host's APIC ID, reset to vcpu id */
+ *ebx = cpu_miscinfo & 0x00FFFFFF;
+		*ebx |= (vcpu->vc_id & 0xFF) << 24;
+ /*
+ * clone host capabilities minus:
+ * debug store (CPUIDECX_DTES64, CPUIDECX_DSCPL, CPUID_DS)
+ * monitor/mwait (CPUIDECX_MWAIT)
+ * vmx (CPUIDECX_VMX)
+ * smx (CPUIDECX_SMX)
+ * speedstep (CPUIDECX_EST)
+ * thermal (CPUIDECX_TM2, CPUID_ACPI, CPUID_TM)
+ * context id (CPUIDECX_CNXTID)
+ * silicon debug (CPUIDECX_SDBG)
+ * xTPR (CPUIDECX_XTPR)
+ * perf/debug (CPUIDECX_PDCM)
+ * pcid (CPUIDECX_PCID)
+ * direct cache access (CPUIDECX_DCA)
+ * x2APIC (CPUIDECX_X2APIC)
+ * apic deadline (CPUIDECX_DEADLINE)
+ * timestamp (CPUID_TSC)
+ * apic (CPUID_APIC)
+ * psn (CPUID_PSN)
+ * self snoop (CPUID_SS)
+ * hyperthreading (CPUID_HTT)
+ * pending break enabled (CPUID_PBE)
+ * MTRR (CPUID_MTRR)
+ * plus:
+ * hypervisor (CPUIDECX_HV)
+ */
+ *ecx = (cpu_ecxfeature | CPUIDECX_HV) &
+ ~(CPUIDECX_EST | CPUIDECX_TM2 |
+ CPUIDECX_MWAIT | CPUIDECX_PDCM |
+ CPUIDECX_VMX | CPUIDECX_DTES64 |
+ CPUIDECX_DSCPL | CPUIDECX_SMX |
+ CPUIDECX_CNXTID | CPUIDECX_SDBG |
+ CPUIDECX_XTPR |
+ CPUIDECX_PCID | CPUIDECX_DCA |
+ CPUIDECX_X2APIC | CPUIDECX_DEADLINE);
+ *edx = curcpu()->ci_feature_flags &
+ ~(CPUID_ACPI | CPUID_TM | CPUID_TSC |
+ CPUID_HTT | CPUID_DS | CPUID_APIC |
+ CPUID_PSN | CPUID_SS | CPUID_PBE |
+ CPUID_MTRR);
+ break;
+ case 0x02: /* Cache and TLB information */
+ DPRINTF("vmx_handle_cpuid: function 0x02 (cache/TLB) not"
+ " supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x03: /* Processor serial number (not supported) */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x04:
+ DPRINTF("vmx_handle_cpuid: function 0x04 (deterministic "
+ "cache info) not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x05: /* MONITOR/MWAIT (not supported) */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x06: /* Thermal / Power management */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x07: /* SEFF */
+ if (*ecx == 0) {
+ /*
+ * SEFF flags - copy from host minus:
+ * SGX (SEFF0EBX_SGX)
+ * HLE (SEFF0EBX_HLE)
+ * INVPCID (SEFF0EBX_INVPCID)
+ * RTM (SEFF0EBX_RTM)
+ * PQM (SEFF0EBX_PQM)
+ * MPX (SEFF0EBX_MPX)
+ * PCOMMIT (SEFF0EBX_PCOMMIT)
+ * PT (SEFF0EBX_PT)
+ */
+ *eax = 0; /* Highest subleaf supported */
+ *ebx = curcpu()->ci_feature_sefflags_ebx &
+ ~(SEFF0EBX_SGX | SEFF0EBX_HLE | SEFF0EBX_INVPCID |
+ SEFF0EBX_RTM | SEFF0EBX_PQM | SEFF0EBX_MPX |
+ SEFF0EBX_PCOMMIT | SEFF0EBX_PT);
+ *ecx = curcpu()->ci_feature_sefflags_ecx;
+ *edx = 0;
+ } else {
+ /* Unsupported subleaf */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ }
+ break;
+ case 0x09: /* Direct Cache Access (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x09 (direct cache access)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0a: /* Architectural performance monitoring */
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0b: /* Extended topology enumeration (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0b (topology enumeration)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0d: /* Processor ext. state information (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0d (ext. state info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x0f: /* QoS info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x0f (QoS info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x14: /* Processor Trace info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x14 (processor trace info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x15: /* TSC / Core Crystal Clock info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x15 (TSC / CCC info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x16: /* Processor frequency info (not supported) */
+ DPRINTF("vmx_handle_cpuid: function 0x16 (frequency info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ case 0x40000000: /* Hypervisor information */
+ *eax = 0;
+ *ebx = *((uint32_t *)&vmm_hv_signature[0]);
+ *ecx = *((uint32_t *)&vmm_hv_signature[4]);
+ *edx = *((uint32_t *)&vmm_hv_signature[8]);
+ break;
+ case 0x80000000: /* Extended function level */
+ *eax = 0x80000007; /* curcpu()->ci_pnfeatset */
+ *ebx = 0;
+ *ecx = 0;
+		*edx = 0;
+		break;
+ case 0x80000001: /* Extended function info */
+ *eax = ecpu_eaxfeature;
+ *ebx = 0; /* Reserved */
+ *ecx = ecpu_ecxfeature;
+ *edx = ecpu_feature;
+ break;
+ case 0x80000002: /* Brand string */
+ *eax = cpu_brandstr[0];
+ *ebx = cpu_brandstr[1];
+ *ecx = cpu_brandstr[2];
+ *edx = cpu_brandstr[3];
+ break;
+ case 0x80000003: /* Brand string */
+ *eax = cpu_brandstr[4];
+ *ebx = cpu_brandstr[5];
+ *ecx = cpu_brandstr[6];
+ *edx = cpu_brandstr[7];
+ break;
+ case 0x80000004: /* Brand string */
+ *eax = cpu_brandstr[8];
+ *ebx = cpu_brandstr[9];
+ *ecx = cpu_brandstr[10];
+ *edx = cpu_brandstr[11];
+ break;
+ case 0x80000005: /* Reserved (Intel), cacheinfo (AMD) */
+ *eax = curcpu()->ci_amdcacheinfo[0];
+ *ebx = curcpu()->ci_amdcacheinfo[1];
+ *ecx = curcpu()->ci_amdcacheinfo[2];
+ *edx = curcpu()->ci_amdcacheinfo[3];
+ break;
+ case 0x80000006: /* ext. cache info */
+ *eax = curcpu()->ci_extcacheinfo[0];
+ *ebx = curcpu()->ci_extcacheinfo[1];
+ *ecx = curcpu()->ci_extcacheinfo[2];
+ *edx = curcpu()->ci_extcacheinfo[3];
+ break;
+ case 0x80000007: /* apmi */
+ *eax = 0; /* Reserved */
+ *ebx = 0; /* Reserved */
+ *ecx = 0; /* Reserved */
+ *edx = 0; /* unsupported ITSC */
+ break;
+ case 0x80000008: /* Phys bits info and topology (AMD) */
+ DPRINTF("vmx_handle_cpuid: function 0x80000008 (phys bits info)"
+ " not supported\n");
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ break;
+ default:
+ DPRINTF("vmx_handle_cpuid: unsupported eax=0x%x\n", *eax);
+ *eax = 0;
+ *ebx = 0;
+ *ecx = 0;
+ *edx = 0;
+ }
+
+ vcpu->vc_gueststate.vg_eip += insn_length;
+
+ return (0);
+}
+
+/*
+ * vcpu_run_svm
+ *
+ * VMM main loop used to run a VCPU.
+ */
+int
+vcpu_run_svm(struct vcpu *vcpu, struct vm_run_params *vrp)
+{
+ /* XXX removed due to rot */
+ return (0);
+}
+
+/*
+ * vmx_exit_reason_decode
+ *
+ * Returns a human readable string describing exit type 'code'
+ */
+const char *
+vmx_exit_reason_decode(uint32_t code)
+{
+ switch (code) {
+ case VMX_EXIT_NMI: return "NMI";
+ case VMX_EXIT_EXTINT: return "external interrupt";
+ case VMX_EXIT_TRIPLE_FAULT: return "triple fault";
+ case VMX_EXIT_INIT: return "INIT signal";
+ case VMX_EXIT_SIPI: return "SIPI signal";
+ case VMX_EXIT_IO_SMI: return "I/O SMI";
+ case VMX_EXIT_OTHER_SMI: return "other SMI";
+ case VMX_EXIT_INT_WINDOW: return "interrupt window";
+ case VMX_EXIT_NMI_WINDOW: return "NMI window";
+ case VMX_EXIT_TASK_SWITCH: return "task switch";
+ case VMX_EXIT_CPUID: return "CPUID instruction";
+ case VMX_EXIT_GETSEC: return "GETSEC instruction";
+ case VMX_EXIT_HLT: return "HLT instruction";
+ case VMX_EXIT_INVD: return "INVD instruction";
+ case VMX_EXIT_INVLPG: return "INVLPG instruction";
+ case VMX_EXIT_RDPMC: return "RDPMC instruction";
+ case VMX_EXIT_RDTSC: return "RDTSC instruction";
+ case VMX_EXIT_RSM: return "RSM instruction";
+ case VMX_EXIT_VMCALL: return "VMCALL instruction";
+ case VMX_EXIT_VMCLEAR: return "VMCLEAR instruction";
+ case VMX_EXIT_VMLAUNCH: return "VMLAUNCH instruction";
+ case VMX_EXIT_VMPTRLD: return "VMPTRLD instruction";
+ case VMX_EXIT_VMPTRST: return "VMPTRST instruction";
+ case VMX_EXIT_VMREAD: return "VMREAD instruction";
+ case VMX_EXIT_VMRESUME: return "VMRESUME instruction";
+ case VMX_EXIT_VMWRITE: return "VMWRITE instruction";
+ case VMX_EXIT_VMXOFF: return "VMXOFF instruction";
+ case VMX_EXIT_VMXON: return "VMXON instruction";
+ case VMX_EXIT_CR_ACCESS: return "CR access";
+ case VMX_EXIT_MOV_DR: return "MOV DR instruction";
+ case VMX_EXIT_IO: return "I/O instruction";
+ case VMX_EXIT_RDMSR: return "RDMSR instruction";
+ case VMX_EXIT_WRMSR: return "WRMSR instruction";
+ case VMX_EXIT_ENTRY_FAILED_GUEST_STATE: return "guest state invalid";
+ case VMX_EXIT_ENTRY_FAILED_MSR_LOAD: return "MSR load failed";
+ case VMX_EXIT_MWAIT: return "MWAIT instruction";
+ case VMX_EXIT_MTF: return "monitor trap flag";
+ case VMX_EXIT_MONITOR: return "MONITOR instruction";
+ case VMX_EXIT_PAUSE: return "PAUSE instruction";
+ case VMX_EXIT_ENTRY_FAILED_MCE: return "MCE during entry";
+ case VMX_EXIT_TPR_BELOW_THRESHOLD: return "TPR below threshold";
+ case VMX_EXIT_APIC_ACCESS: return "APIC access";
+ case VMX_EXIT_VIRTUALIZED_EOI: return "virtualized EOI";
+ case VMX_EXIT_GDTR_IDTR: return "GDTR/IDTR access";
+ case VMX_EXIT_LDTR_TR: return "LDTR/TR access";
+ case VMX_EXIT_EPT_VIOLATION: return "EPT violation";
+ case VMX_EXIT_EPT_MISCONFIGURATION: return "EPT misconfiguration";
+ case VMX_EXIT_INVEPT: return "INVEPT instruction";
+ case VMX_EXIT_RDTSCP: return "RDTSCP instruction";
+ case VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED:
+ return "preemption timer expired";
+ case VMX_EXIT_INVVPID: return "INVVPID instruction";
+ case VMX_EXIT_WBINVD: return "WBINVD instruction";
+ case VMX_EXIT_XSETBV: return "XSETBV instruction";
+ case VMX_EXIT_APIC_WRITE: return "APIC write";
+ case VMX_EXIT_RDRAND: return "RDRAND instruction";
+ case VMX_EXIT_INVPCID: return "INVPCID instruction";
+ case VMX_EXIT_VMFUNC: return "VMFUNC instruction";
+ case VMX_EXIT_RDSEED: return "RDSEED instruction";
+ case VMX_EXIT_XSAVES: return "XSAVES instruction";
+ case VMX_EXIT_XRSTORS: return "XRSTORS instruction";
+ default: return "unknown";
+ }
+}
+
+/*
+ * vmx_instruction_error_decode
+ *
+ * Returns a human readable string describing the instruction error in 'code'
+ */
+const char *
+vmx_instruction_error_decode(uint32_t code)
+{
+ switch (code) {
+ case 1: return "VMCALL: unsupported in VMX root";
+ case 2: return "VMCLEAR: invalid paddr";
+ case 3: return "VMCLEAR: VMXON pointer";
+ case 4: return "VMLAUNCH: non-clear VMCS";
+ case 5: return "VMRESUME: non-launched VMCS";
+ case 6: return "VMRESUME: executed after VMXOFF";
+ case 7: return "VM entry: invalid control field(s)";
+ case 8: return "VM entry: invalid host state field(s)";
+ case 9: return "VMPTRLD: invalid paddr";
+ case 10: return "VMPTRLD: VMXON pointer";
+ case 11: return "VMPTRLD: incorrect VMCS revid";
+ case 12: return "VMREAD/VMWRITE: unsupported VMCS field";
+ case 13: return "VMWRITE: RO VMCS field";
+ case 15: return "VMXON: unsupported in VMX root";
+ case 20: return "VMCALL: invalid VM exit control fields";
+ case 26: return "VM entry: blocked by MOV SS";
+ case 28: return "Invalid operand to INVEPT/INVVPID";
+ default: return "unknown";
+ }
+}
+
+/*
+ * vcpu_state_decode
+ *
+ * Returns a human readable string describing the vcpu state in 'state'.
+ */
+const char *
+vcpu_state_decode(u_int state)
+{
+ switch (state) {
+ case VCPU_STATE_STOPPED: return "stopped";
+ case VCPU_STATE_RUNNING: return "running";
+ case VCPU_STATE_REQTERM: return "requesting termination";
+ case VCPU_STATE_TERMINATED: return "terminated";
+ case VCPU_STATE_UNKNOWN: return "unknown";
+ default: return "invalid";
+ }
+}
+
+#ifdef VMM_DEBUG
+/*
+ * dump_vcpu
+ *
+ * Dumps the VMX capabilities of vcpu 'vcpu'
+ */
+void
+dump_vcpu(struct vcpu *vcpu)
+{
+ printf("vcpu @ %p\n", vcpu);
+ printf(" parent vm @ %p\n", vcpu->vc_parent);
+ printf(" mode: ");
+ if (vcpu->vc_virt_mode == VMM_MODE_VMX ||
+ vcpu->vc_virt_mode == VMM_MODE_EPT) {
+ printf("VMX\n");
+ printf(" pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_pinbased_ctls);
+ printf(" true pinbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_pinbased_ctls);
+ CTRL_DUMP(vcpu, PINBASED, EXTERNAL_INT_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, NMI_EXITING);
+ CTRL_DUMP(vcpu, PINBASED, VIRTUAL_NMIS);
+ CTRL_DUMP(vcpu, PINBASED, ACTIVATE_VMX_PREEMPTION_TIMER);
+ CTRL_DUMP(vcpu, PINBASED, PROCESS_POSTED_INTERRUPTS);
+ printf(" procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased_ctls);
+ printf(" true procbased ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_procbased_ctls);
+ CTRL_DUMP(vcpu, PROCBASED, INTERRUPT_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TSC_OFFSETTING);
+ CTRL_DUMP(vcpu, PROCBASED, HLT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, INVLPG_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MWAIT_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDPMC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, RDTSC_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR3_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_LOAD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, CR8_STORE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_TPR_SHADOW);
+ CTRL_DUMP(vcpu, PROCBASED, NMI_WINDOW_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, MOV_DR_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, UNCONDITIONAL_IO_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, USE_IO_BITMAPS);
+ CTRL_DUMP(vcpu, PROCBASED, MONITOR_TRAP_FLAG);
+ CTRL_DUMP(vcpu, PROCBASED, USE_MSR_BITMAPS);
+ CTRL_DUMP(vcpu, PROCBASED, MONITOR_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED, PAUSE_EXITING);
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
+ printf(" procbased2 ctls: 0x%llx\n",
+ vcpu->vc_vmx_procbased2_ctls);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_APIC);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_EPT);
+ CTRL_DUMP(vcpu, PROCBASED2, DESCRIPTOR_TABLE_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_RDTSCP);
+ CTRL_DUMP(vcpu, PROCBASED2, VIRTUALIZE_X2APIC_MODE);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VPID);
+ CTRL_DUMP(vcpu, PROCBASED2, WBINVD_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, UNRESTRICTED_GUEST);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ APIC_REGISTER_VIRTUALIZATION);
+ CTRL_DUMP(vcpu, PROCBASED2,
+ VIRTUAL_INTERRUPT_DELIVERY);
+ CTRL_DUMP(vcpu, PROCBASED2, PAUSE_LOOP_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, RDRAND_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_INVPCID);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_VM_FUNCTIONS);
+ CTRL_DUMP(vcpu, PROCBASED2, VMCS_SHADOWING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_ENCLS_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, RDSEED_EXITING);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_PML);
+ CTRL_DUMP(vcpu, PROCBASED2, EPT_VIOLATION_VE);
+ CTRL_DUMP(vcpu, PROCBASED2, CONCEAL_VMX_FROM_PT);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_XSAVES_XRSTORS);
+ CTRL_DUMP(vcpu, PROCBASED2, ENABLE_TSC_SCALING);
+ }
+ printf(" entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_entry_ctls);
+ printf(" true entry ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_entry_ctls);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, ENTRY, IA32E_MODE_GUEST);
+ CTRL_DUMP(vcpu, ENTRY, ENTRY_TO_SMM);
+ CTRL_DUMP(vcpu, ENTRY, DEACTIVATE_DUAL_MONITOR_TREATMENT);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_PAT_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_EFER_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, LOAD_IA32_BNDCFGS_ON_ENTRY);
+ CTRL_DUMP(vcpu, ENTRY, CONCEAL_VM_ENTRIES_FROM_PT);
+ printf(" exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_exit_ctls);
+ printf(" true exit ctls: 0x%llx\n",
+ vcpu->vc_vmx_true_exit_ctls);
+ CTRL_DUMP(vcpu, EXIT, SAVE_DEBUG_CONTROLS);
+ CTRL_DUMP(vcpu, EXIT, HOST_SPACE_ADDRESS_SIZE);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, ACKNOWLEDGE_INTERRUPT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_PAT_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, LOAD_IA32_EFER_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, SAVE_VMX_PREEMPTION_TIMER);
+ CTRL_DUMP(vcpu, EXIT, CLEAR_IA32_BNDCFGS_ON_EXIT);
+ CTRL_DUMP(vcpu, EXIT, CONCEAL_VM_EXITS_FROM_PT);
+ }
+}
+
+/*
+ * vmx_dump_vmcs_field
+ *
+ * Debug function to dump the contents of a single VMCS field
+ *
+ * Parameters:
+ * fieldid: VMCS Field ID
+ * msg: string to display
+ */
+void
+vmx_dump_vmcs_field(uint16_t fieldid, const char *msg)
+{
+ uint8_t width;
+ uint64_t val;
+ uint32_t val_lo, val_hi;
+
+ DPRINTF("%s (0x%04x): ", msg, fieldid);
+ width = (fieldid >> 13) & 0x3;
+
+	if (width == 1) {
+		if (vmread(fieldid, &val_lo)) {
+			DPRINTF("???? ");
+			return;
+		}
+		if (vmread(fieldid + 1, &val_hi)) {
+			DPRINTF("???? ");
+			return;
+		}
+
+		val = (uint64_t)val_lo | (uint64_t)val_hi << 32ULL;
+	} else {
+		/* 16-bit, 32-bit and natural width fields fit in one read */
+		if (vmread(fieldid, &val_lo)) {
+			DPRINTF("???? ");
+			return;
+		}
+
+		val = (uint64_t)val_lo;
+	}
+
+ /*
+	 * Field width encoding: bits 14:13 of the field ID
+ *
+ * 0: 16-bit
+ * 1: 64-bit
+ * 2: 32-bit
+ * 3: natural width
+ */
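+	/*
+	 * e.g. VMCS_GUEST_IA32_ES_SEL (0x0800) encodes width 0 (16-bit),
+	 * the 0x2xxx fields encode 1 (64-bit) and the 0x4xxx fields
+	 * encode 2 (32-bit).
+	 */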
+ switch (width) {
+ case 0: DPRINTF("0x%04llx ", val); break;
+ case 1:
+ case 3: DPRINTF("0x%016llx ", val); break;
+ case 2: DPRINTF("0x%08llx ", val);
+ }
+}
+
+/*
+ * vmx_dump_vmcs
+ *
+ * Debug function to dump the contents of the current VMCS.
+ */
+void
+vmx_dump_vmcs(struct vcpu *vcpu)
+{
+ int has_sec, i;
+ uint32_t cr3_tgt_ct;
+
+ /* XXX save and load new vmcs, restore at end */
+
+ DPRINTF("--CURRENT VMCS STATE--\n");
+ DPRINTF("VMXON revision : 0x%x\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_vmxon_revision);
+ DPRINTF("CR0 fixed0: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed0);
+ DPRINTF("CR0 fixed1: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr0_fixed1);
+ DPRINTF("CR4 fixed0: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed0);
+ DPRINTF("CR4 fixed1: 0x%llx\n",
+ curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr4_fixed1);
+ DPRINTF("MSR table size: 0x%x\n",
+ 512 * (curcpu()->ci_vmm_cap.vcc_vmx.vmx_msr_table_size + 1));
+
+ has_sec = vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1);
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VPID, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_VPID, "VPID");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) {
+ vmx_dump_vmcs_field(VMCS_POSTED_INT_NOTIF_VECTOR,
+ "Posted Int Notif Vec");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_EPT_VIOLATION_VE, 1)) {
+ vmx_dump_vmcs_field(VMCS_EPTP_INDEX, "EPTP idx");
+ }
+ }
+
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_SEL, "G.ES");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_SEL, "G.CS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_SEL, "G.SS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_SEL, "G.DS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_SEL, "G.FS");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_SEL, "G.GS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_SEL, "LDTR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_SEL, "G.TR");
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPT_STATUS,
+ "Int sts");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_PML, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PML_INDEX, "PML Idx");
+ }
+ }
+
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_ES_SEL, "H.ES");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CS_SEL, "H.CS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SS_SEL, "H.SS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_DS_SEL, "H.DS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_SEL, "H.FS");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_SEL, "H.GS");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_IO_BITMAP_A, "I/O Bitmap A");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_BITMAP_B, "I/O Bitmap B");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_MSR_BITMAPS, 1)) {
+ vmx_dump_vmcs_field(VMCS_MSR_BITMAP_ADDRESS, "MSR Bitmap");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_EXIT_STORE_MSR_ADDRESS, "Exit Store MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_LOAD_MSR_ADDRESS, "Exit Load MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_LOAD_MSR_ADDRESS, "Entry Load MSRs");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXECUTIVE_VMCS_POINTER, "Exec VMCS Ptr");
+ DPRINTF("\n");
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_PML, 1)) {
+ vmx_dump_vmcs_field(VMCS_PML_ADDRESS, "PML Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ vmx_dump_vmcs_field(VMCS_TSC_OFFSET, "TSC Offset");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_TPR_SHADOW, 1)) {
+ vmx_dump_vmcs_field(VMCS_VIRTUAL_APIC_ADDRESS,
+ "Virtual APIC Addr");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUALIZE_APIC, 1)) {
+ vmx_dump_vmcs_field(VMCS_APIC_ACCESS_ADDRESS,
+ "APIC Access Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_PROCESS_POSTED_INTERRUPTS, 1)) {
+ vmx_dump_vmcs_field(VMCS_POSTED_INTERRUPT_DESC,
+ "Posted Int Desc Addr");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) {
+ vmx_dump_vmcs_field(VMCS_VM_FUNCTION_CONTROLS,
+ "VM Function Controls");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_EPTP,
+ "EPT Pointer");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY, 1)) {
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_0,
+ "EOI Exit Bitmap 0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_1,
+ "EOI Exit Bitmap 1");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_2,
+ "EOI Exit Bitmap 2");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EOI_EXIT_BITMAP_3,
+ "EOI Exit Bitmap 3");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_VM_FUNCTIONS, 1)) {
+ /* We assume all CPUs have the same VMFUNC caps */
+ if (curcpu()->ci_vmm_cap.vcc_vmx.vmx_vm_func & 0x1) {
+ vmx_dump_vmcs_field(VMCS_EPTP_LIST_ADDRESS,
+ "EPTP List Addr");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_VMCS_SHADOWING, 1)) {
+ vmx_dump_vmcs_field(VMCS_VMREAD_BITMAP_ADDRESS,
+ "VMREAD Bitmap Addr");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_VMWRITE_BITMAP_ADDRESS,
+ "VMWRITE Bitmap Addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_EPT_VIOLATION_VE, 1)) {
+ vmx_dump_vmcs_field(VMCS_VIRTUALIZATION_EXC_ADDRESS,
+ "#VE Addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_XSAVES_XRSTORS, 1)) {
+ vmx_dump_vmcs_field(VMCS_XSS_EXITING_BITMAP,
+ "XSS exiting bitmap addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_ENCLS_EXITING, 1)) {
+ vmx_dump_vmcs_field(VMCS_ENCLS_EXITING_BITMAP,
+ "Encls exiting bitmap addr");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_TSC_SCALING, 1)) {
+ vmx_dump_vmcs_field(VMCS_TSC_MULTIPLIER,
+ "TSC scaling factor");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PHYSICAL_ADDRESS,
+ "Guest PA");
+ DPRINTF("\n");
+ }
+ }
+
+ vmx_dump_vmcs_field(VMCS_LINK_POINTER, "VMCS Link Pointer");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DEBUGCTL, "Guest DEBUGCTL");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_PAT_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_SAVE_IA32_PAT_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_PAT,
+ "Guest PAT");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_EFER_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_SAVE_IA32_EFER_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_EFER,
+ "Guest EFER");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_PERF_GBL_CTRL,
+ "Guest Perf Global Ctrl");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_ENABLE_EPT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE0, "Guest PDPTE0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE1, "Guest PDPTE1");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE2, "Guest PDPTE2");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PDPTE3, "Guest PDPTE3");
+ DPRINTF("\n");
+ }
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_ENTRY_CTLS,
+ IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY, 1) ||
+ vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_BNDCFGS,
+ "Guest BNDCFGS");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_PAT_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_PAT,
+ "Host PAT");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_EFER_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_EFER,
+ "Host EFER");
+ DPRINTF("\n");
+ }
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_EXIT_CTLS,
+ IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT, 1)) {
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_PERF_GBL_CTRL,
+ "Host Perf Global Ctrl");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_PINBASED_CTLS, "Pinbased Ctrls");
+ vmx_dump_vmcs_field(VMCS_PROCBASED_CTLS, "Procbased Ctrls");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXCEPTION_BITMAP, "Exception Bitmap");
+ vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MASK, "#PF Err Code Mask");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_PF_ERROR_CODE_MATCH, "#PF Err Code Match");
+ vmx_dump_vmcs_field(VMCS_CR3_TARGET_COUNT, "CR3 Tgt Count");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_CTLS, "Exit Ctrls");
+ vmx_dump_vmcs_field(VMCS_EXIT_MSR_STORE_COUNT, "Exit MSR Store Ct");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_EXIT_MSR_LOAD_COUNT, "Exit MSR Load Ct");
+ vmx_dump_vmcs_field(VMCS_ENTRY_CTLS, "Entry Ctrls");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_MSR_LOAD_COUNT, "Entry MSR Load Ct");
+ vmx_dump_vmcs_field(VMCS_ENTRY_INTERRUPTION_INFO, "Entry Int. Info");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_ENTRY_EXCEPTION_ERROR_CODE,
+ "Entry Ex. Err Code");
+ vmx_dump_vmcs_field(VMCS_ENTRY_INSTRUCTION_LENGTH, "Entry Insn Len");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED_CTLS,
+ IA32_VMX_USE_TPR_SHADOW, 1)) {
+ vmx_dump_vmcs_field(VMCS_TPR_THRESHOLD, "TPR Threshold");
+ DPRINTF("\n");
+ }
+
+ if (has_sec) {
+ vmx_dump_vmcs_field(VMCS_PROCBASED2_CTLS, "2ndary Ctrls");
+ DPRINTF("\n");
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
+ IA32_VMX_PAUSE_LOOP_EXITING, 1)) {
+ vmx_dump_vmcs_field(VMCS_PLE_GAP, "PLE Gap");
+ vmx_dump_vmcs_field(VMCS_PLE_WINDOW, "PLE Window");
+ }
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_INSTRUCTION_ERROR, "Insn Error");
+ vmx_dump_vmcs_field(VMCS_EXIT_REASON, "Exit Reason");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_INFO, "Exit Int. Info");
+ vmx_dump_vmcs_field(VMCS_EXIT_INTERRUPTION_ERR_CODE,
+ "Exit Int. Err Code");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_IDT_VECTORING_INFO, "IDT vect info");
+ vmx_dump_vmcs_field(VMCS_IDT_VECTORING_ERROR_CODE,
+ "IDT vect err code");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_INSTRUCTION_LENGTH, "Insn Len");
+ vmx_dump_vmcs_field(VMCS_EXIT_INSTRUCTION_INFO, "Exit Insn Info");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_LIMIT, "G. ES Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_LIMIT, "G. CS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_LIMIT, "G. SS Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_LIMIT, "G. DS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_LIMIT, "G. FS Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_LIMIT, "G. GS Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_LIMIT, "G. LDTR Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_LIMIT, "G. TR Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_LIMIT, "G. GDTR Lim");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_LIMIT, "G. IDTR Lim");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_AR, "G. ES AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_AR, "G. CS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_AR, "G. SS AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_AR, "G. DS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_AR, "G. FS AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_AR, "G. GS AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_AR, "G. LDTR AR");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_AR, "G. TR AR");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_INTERRUPTIBILITY_ST, "G. Int St.");
+ vmx_dump_vmcs_field(VMCS_GUEST_ACTIVITY_STATE, "G. Act St.");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_GUEST_SMBASE, "G. SMBASE");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_CS, "G. SYSENTER CS");
+ DPRINTF("\n");
+
+ if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PINBASED_CTLS,
+ IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER, 1)) {
+ vmx_dump_vmcs_field(VMCS_VMX_PREEMPTION_TIMER_VAL,
+ "VMX Preempt Timer");
+ DPRINTF("\n");
+ }
+
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_CS, "H. SYSENTER CS");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_CR0_MASK, "CR0 Mask");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_CR4_MASK, "CR4 Mask");
+ DPRINTF("\n");
+
+ vmx_dump_vmcs_field(VMCS_CR0_READ_SHADOW, "CR0 RD Shadow");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_CR4_READ_SHADOW, "CR4 RD Shadow");
+ DPRINTF("\n");
+
+ /* We assume all CPUs have the same max CR3 target ct */
+ cr3_tgt_ct = curcpu()->ci_vmm_cap.vcc_vmx.vmx_cr3_tgt_count;
+ DPRINTF("Max CR3 target count: 0x%x\n", cr3_tgt_ct);
+ if (cr3_tgt_ct <= VMX_MAX_CR3_TARGETS) {
+ for (i = 0 ; i < cr3_tgt_ct; i++) {
+ vmx_dump_vmcs_field(VMCS_CR3_TARGET_0 + (2 * i),
+ "CR3 Target");
+ DPRINTF("\n");
+ }
+ } else {
+		DPRINTF("(Bogus CR3 Target Count > %d)\n", VMX_MAX_CR3_TARGETS);
+ }
+
+ vmx_dump_vmcs_field(VMCS_GUEST_EXIT_QUALIFICATION, "G. Exit Qual");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RCX, "I/O RCX");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RSI, "I/O RSI");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RDI, "I/O RDI");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_IO_RIP, "I/O RIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_LINEAR_ADDRESS, "G. Lin Addr");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR0, "G. CR0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR3, "G. CR3");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CR4, "G. CR4");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_ES_BASE, "G. ES Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_CS_BASE, "G. CS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SS_BASE, "G. SS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DS_BASE, "G. DS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_FS_BASE, "G. FS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GS_BASE, "G. GS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_LDTR_BASE, "G. LDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_TR_BASE, "G. TR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_GDTR_BASE, "G. GDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_IDTR_BASE, "G. IDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_DR7, "G. DR7");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RSP, "G. RSP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RIP, "G. RIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_RFLAGS, "G. RFLAGS");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_PENDING_DBG_EXC, "G. Pend Dbg Exc");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_ESP, "G. SYSENTER ESP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_GUEST_IA32_SYSENTER_EIP, "G. SYSENTER EIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR0, "H. CR0");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR3, "H. CR3");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_CR4, "H. CR4");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_FS_BASE, "H. FS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GS_BASE, "H. GS Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_TR_BASE, "H. TR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_GDTR_BASE, "H. GDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_IDTR_BASE, "H. IDTR Base");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_ESP, "H. SYSENTER ESP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_SYSENTER_EIP, "H. SYSENTER EIP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_RSP, "H. RSP");
+ DPRINTF("\n");
+ vmx_dump_vmcs_field(VMCS_HOST_IA32_RIP, "H. RIP");
+ DPRINTF("\n");
+}
+
+/*
+ * vmx_vcpu_dump_regs
+ *
+ * Debug function to print vcpu regs from the current vcpu
+ * note - vmcs for 'vcpu' must be on this pcpu.
+ *
+ * Parameters:
+ * vcpu - vcpu whose registers should be dumped
+ */
+void
+vmx_vcpu_dump_regs(struct vcpu *vcpu)
+{
+ uint32_t r;
+ int i;
+ struct vmx_msr_store *msr_store;
+
+ DPRINTF("vcpu @ %p\n", vcpu);
+ DPRINTF(" eax=0x%08x ebx=0x%08x ecx=0x%08x\n",
+ vcpu->vc_gueststate.vg_eax, vcpu->vc_gueststate.vg_ebx,
+ vcpu->vc_gueststate.vg_ecx);
+ DPRINTF(" edx=0x%08x ebp=0x%08x edi=0x%08x\n",
+ vcpu->vc_gueststate.vg_edx, vcpu->vc_gueststate.vg_ebp,
+ vcpu->vc_gueststate.vg_edi);
+ DPRINTF(" esi=0x%08x\n", vcpu->vc_gueststate.vg_esi);
+
+ DPRINTF(" eip=0x%08x rsp=", vcpu->vc_gueststate.vg_eip);
+ if (vmread(VMCS_GUEST_IA32_RSP, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" cr0=");
+ if (vmread(VMCS_GUEST_IA32_CR0, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%08x ", r);
+ vmm_decode_cr0(r);
+ }
+
+ DPRINTF(" cr2=0x%08x\n", vcpu->vc_gueststate.vg_cr2);
+
+ DPRINTF(" cr3=");
+ if (vmread(VMCS_GUEST_IA32_CR3, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x ", r);
+
+ DPRINTF(" cr4=");
+ if (vmread(VMCS_GUEST_IA32_CR4, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%08x ", r);
+ vmm_decode_cr4(r);
+ }
+
+ DPRINTF(" --Guest Segment Info--\n");
+
+ DPRINTF(" cs=");
+ if (vmread(VMCS_GUEST_IA32_CS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_CS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_CS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_CS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" ds=");
+ if (vmread(VMCS_GUEST_IA32_DS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_DS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_DS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_DS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" es=");
+ if (vmread(VMCS_GUEST_IA32_ES_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_ES_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_ES_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_ES_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" fs=");
+ if (vmread(VMCS_GUEST_IA32_FS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_FS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_FS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_FS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" gs=");
+ if (vmread(VMCS_GUEST_IA32_GS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_GS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_GS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_GS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" ss=");
+ if (vmread(VMCS_GUEST_IA32_SS_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x rpl=%d", r, r & 0x3);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_SS_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_SS_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_SS_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" tr=");
+ if (vmread(VMCS_GUEST_IA32_TR_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x", r);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_TR_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_TR_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_TR_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" gdtr base=");
+ if (vmread(VMCS_GUEST_IA32_GDTR_BASE, &r))
+ DPRINTF("(error reading) ");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_GDTR_LIMIT, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" idtr base=");
+ if (vmread(VMCS_GUEST_IA32_IDTR_BASE, &r))
+ DPRINTF("(error reading) ");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_IDTR_LIMIT, &r))
+ DPRINTF("(error reading)\n");
+ else
+ DPRINTF("0x%08x\n", r);
+
+ DPRINTF(" ldtr=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_SEL, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%04x", r);
+
+ DPRINTF(" base=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_BASE, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" limit=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_LIMIT, &r))
+ DPRINTF("(error reading)");
+ else
+ DPRINTF("0x%08x", r);
+
+ DPRINTF(" a/r=");
+ if (vmread(VMCS_GUEST_IA32_LDTR_AR, &r))
+ DPRINTF("(error reading)\n");
+ else {
+ DPRINTF("0x%04x\n ", r);
+ vmm_segment_desc_decode(r);
+ }
+
+ DPRINTF(" --Guest MSRs @ 0x%08x (paddr: 0x%08x)--\n",
+ (uint32_t)vcpu->vc_vmx_msr_exit_save_va,
+ (uint32_t)vcpu->vc_vmx_msr_exit_save_pa);
+
+ msr_store = (struct vmx_msr_store *)vcpu->vc_vmx_msr_exit_save_va;
+
+ for (i = 0; i < VMX_NUM_MSR_STORE; i++) {
+ DPRINTF(" MSR %d @ %p : 0x%08x (%s), "
+ "value=0x%016llx ",
+ i, &msr_store[i], msr_store[i].vms_index,
+ msr_name_decode(msr_store[i].vms_index),
+ msr_store[i].vms_data);
+ vmm_decode_msr_value(msr_store[i].vms_index,
+ msr_store[i].vms_data);
+ }
+
+ DPRINTF(" last PIC irq=%d\n", vcpu->vc_intr);
+}
+
+/*
+ * msr_name_decode
+ *
+ * Returns a human-readable name for the MSR supplied in 'msr'.
+ *
+ * Parameters:
+ * msr - The MSR to decode
+ *
+ * Return value:
+ * NULL-terminated character string containing the name of the MSR requested
+ */
+const char *
+msr_name_decode(uint32_t msr)
+{
+ /*
+ * Add as needed. Also consider adding a decode function when
+ * adding to this table.
+ */
+
+ switch (msr) {
+ case MSR_TSC: return "TSC";
+ case MSR_APICBASE: return "APIC base";
+ case MSR_IA32_FEATURE_CONTROL: return "IA32 feature control";
+ case MSR_PERFCTR0: return "perf counter 0";
+ case MSR_PERFCTR1: return "perf counter 1";
+ case MSR_TEMPERATURE_TARGET: return "temperature target";
+ case MSR_MTRRcap: return "MTRR cap";
+ case MSR_PERF_STATUS: return "perf status";
+ case MSR_PERF_CTL: return "perf control";
+ case MSR_MTRRvarBase: return "MTRR variable base";
+ case MSR_MTRRfix64K_00000: return "MTRR fixed 64K";
+ case MSR_MTRRfix16K_80000: return "MTRR fixed 16K";
+ case MSR_MTRRfix4K_C0000: return "MTRR fixed 4K";
+ case MSR_CR_PAT: return "PAT";
+ case MSR_MTRRdefType: return "MTRR default type";
+ case MSR_EFER: return "EFER";
+ case MSR_STAR: return "STAR";
+ case MSR_LSTAR: return "LSTAR";
+ case MSR_CSTAR: return "CSTAR";
+ case MSR_SFMASK: return "SFMASK";
+ case MSR_FSBASE: return "FSBASE";
+ case MSR_GSBASE: return "GSBASE";
+ case MSR_KERNELGSBASE: return "KGSBASE";
+ default: return "Unknown MSR";
+ }
+}
+
+/*
+ * vmm_segment_desc_decode
+ *
+ * Debug function to print segment information for supplied descriptor
+ *
+ * Parameters:
+ * val - The A/R bytes for the segment descriptor to decode
+ */
+void
+vmm_segment_desc_decode(uint32_t val)
+{
+ uint16_t ar;
+ uint8_t g, type, s, dpl, p, dib, l;
+ uint32_t unusable;
+
+ /* Exit early on unusable descriptors */
+ unusable = val & 0x10000;
+ if (unusable) {
+ DPRINTF("(unusable)\n");
+ return;
+ }
+
+ ar = (uint16_t)val;
+
+ g = (ar & 0x8000) >> 15;
+ dib = (ar & 0x4000) >> 14;
+ l = (ar & 0x2000) >> 13;
+ p = (ar & 0x80) >> 7;
+ dpl = (ar & 0x60) >> 5;
+ s = (ar & 0x10) >> 4;
+ type = (ar & 0xf);
+
+ DPRINTF("granularity=%d dib=%d l(64 bit)=%d present=%d sys=%d ",
+ g, dib, l, p, s);
+
+ DPRINTF("type=");
+ if (!s) {
+ switch (type) {
+ case SDT_SYSLDT: DPRINTF("ldt\n"); break;
+ case SDT_SYS386TSS: DPRINTF("tss (available)\n"); break;
+ case SDT_SYS386BSY: DPRINTF("tss (busy)\n"); break;
+ case SDT_SYS386CGT: DPRINTF("call gate\n"); break;
+ case SDT_SYS386IGT: DPRINTF("interrupt gate\n"); break;
+ case SDT_SYS386TGT: DPRINTF("trap gate\n"); break;
+ /* XXX handle 32 bit segment types by inspecting mode */
+		default: DPRINTF("unknown\n");
+ }
+ } else {
+ switch (type + 16) {
+ case SDT_MEMRO: DPRINTF("data, r/o\n"); break;
+ case SDT_MEMROA: DPRINTF("data, r/o, accessed\n"); break;
+ case SDT_MEMRW: DPRINTF("data, r/w\n"); break;
+ case SDT_MEMRWA: DPRINTF("data, r/w, accessed\n"); break;
+ case SDT_MEMROD: DPRINTF("data, r/o, expand down\n"); break;
+ case SDT_MEMRODA: DPRINTF("data, r/o, expand down, "
+ "accessed\n");
+ break;
+ case SDT_MEMRWD: DPRINTF("data, r/w, expand down\n"); break;
+ case SDT_MEMRWDA: DPRINTF("data, r/w, expand down, "
+ "accessed\n");
+ break;
+ case SDT_MEME: DPRINTF("code, x only\n"); break;
+		case SDT_MEMEA: DPRINTF("code, x only, accessed\n"); break;
+ case SDT_MEMER: DPRINTF("code, r/x\n"); break;
+ case SDT_MEMERA: DPRINTF("code, r/x, accessed\n"); break;
+ case SDT_MEMEC: DPRINTF("code, x only, conforming\n"); break;
+ case SDT_MEMEAC: DPRINTF("code, x only, conforming, "
+ "accessed\n");
+ break;
+ case SDT_MEMERC: DPRINTF("code, r/x, conforming\n"); break;
+ case SDT_MEMERAC: DPRINTF("code, r/x, conforming, accessed\n");
+ break;
+ }
+ }
+}
+
+void
+vmm_decode_cr0(uint32_t cr0)
+{
+ struct vmm_reg_debug_info cr0_info[11] = {
+ { CR0_PG, "PG ", "pg " },
+ { CR0_CD, "CD ", "cd " },
+ { CR0_NW, "NW ", "nw " },
+ { CR0_AM, "AM ", "am " },
+ { CR0_WP, "WP ", "wp " },
+ { CR0_NE, "NE ", "ne " },
+ { CR0_ET, "ET ", "et " },
+ { CR0_TS, "TS ", "ts " },
+ { CR0_EM, "EM ", "em " },
+ { CR0_MP, "MP ", "mp " },
+ { CR0_PE, "PE", "pe" }
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 11; i++)
+ if (cr0 & cr0_info[i].vrdi_bit)
+ DPRINTF(cr0_info[i].vrdi_present);
+ else
+ DPRINTF(cr0_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_cr4(uint32_t cr4)
+{
+ struct vmm_reg_debug_info cr4_info[19] = {
+ { CR4_PKE, "PKE ", "pke "},
+ { CR4_SMAP, "SMAP ", "smap "},
+ { CR4_SMEP, "SMEP ", "smep "},
+ { CR4_OSXSAVE, "OSXSAVE ", "osxsave "},
+ { CR4_PCIDE, "PCIDE ", "pcide "},
+ { CR4_FSGSBASE, "FSGSBASE ", "fsgsbase "},
+ { CR4_SMXE, "SMXE ", "smxe "},
+ { CR4_VMXE, "VMXE ", "vmxe "},
+ { CR4_OSXMMEXCPT, "OSXMMEXCPT ", "osxmmexcpt "},
+ { CR4_OSFXSR, "OSFXSR ", "osfxsr "},
+ { CR4_PCE, "PCE ", "pce "},
+ { CR4_PGE, "PGE ", "pge "},
+ { CR4_MCE, "MCE ", "mce "},
+ { CR4_PAE, "PAE ", "pae "},
+ { CR4_PSE, "PSE ", "pse "},
+ { CR4_DE, "DE ", "de "},
+ { CR4_TSD, "TSD ", "tsd "},
+ { CR4_PVI, "PVI ", "pvi "},
+ { CR4_VME, "VME", "vme"}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 19; i++)
+ if (cr4 & cr4_info[i].vrdi_bit)
+ DPRINTF(cr4_info[i].vrdi_present);
+ else
+ DPRINTF(cr4_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_apicbase_msr_value(uint64_t apicbase)
+{
+ struct vmm_reg_debug_info apicbase_info[3] = {
+ { APICBASE_BSP, "BSP ", "bsp "},
+ { APICBASE_ENABLE_X2APIC, "X2APIC ", "x2apic "},
+ { APICBASE_GLOBAL_ENABLE, "GLB_EN", "glb_en"}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 3; i++)
+ if (apicbase & apicbase_info[i].vrdi_bit)
+ DPRINTF(apicbase_info[i].vrdi_present);
+ else
+ DPRINTF(apicbase_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_ia32_fc_value(uint64_t fcr)
+{
+ struct vmm_reg_debug_info fcr_info[4] = {
+ { IA32_FEATURE_CONTROL_LOCK, "LOCK ", "lock "},
+ { IA32_FEATURE_CONTROL_SMX_EN, "SMX ", "smx "},
+ { IA32_FEATURE_CONTROL_VMX_EN, "VMX ", "vmx "},
+ { IA32_FEATURE_CONTROL_SENTER_EN, "SENTER ", "senter "}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 4; i++)
+ if (fcr & fcr_info[i].vrdi_bit)
+ DPRINTF(fcr_info[i].vrdi_present);
+ else
+ DPRINTF(fcr_info[i].vrdi_absent);
+
+ if (fcr & IA32_FEATURE_CONTROL_SENTER_EN)
+ DPRINTF(" [SENTER param = 0x%llx]",
+ (fcr & IA32_FEATURE_CONTROL_SENTER_PARAM_MASK) >> 8);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_mtrrcap_value(uint64_t val)
+{
+ struct vmm_reg_debug_info mtrrcap_info[3] = {
+ { MTRRcap_FIXED, "FIXED ", "fixed "},
+ { MTRRcap_WC, "WC ", "wc "},
+ { MTRRcap_SMRR, "SMRR ", "smrr "}
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 3; i++)
+ if (val & mtrrcap_info[i].vrdi_bit)
+ DPRINTF(mtrrcap_info[i].vrdi_present);
+ else
+ DPRINTF(mtrrcap_info[i].vrdi_absent);
+
+ if (val & MTRRcap_FIXED)
+ DPRINTF(" [nr fixed ranges = 0x%llx]",
+ (val & 0xff));
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_perf_status_value(uint64_t val)
+{
+ DPRINTF("(pstate ratio = 0x%llx)\n", (val & 0xffff));
+}
+
+void
+vmm_decode_perf_ctl_value(uint64_t val)
+{
+	DPRINTF("(%s ", (val & PERF_CTL_TURBO) ? "TURBO" : "turbo");
+	DPRINTF("pstate req = 0x%llx)\n", (val & 0xffff));
+}
+
+void
+vmm_decode_mtrrdeftype_value(uint64_t mtrrdeftype)
+{
+ struct vmm_reg_debug_info mtrrdeftype_info[2] = {
+ { MTRRdefType_FIXED_ENABLE, "FIXED ", "fixed "},
+ { MTRRdefType_ENABLE, "ENABLED ", "enabled "},
+ };
+
+ uint8_t i;
+ int type;
+
+ DPRINTF("(");
+ for (i = 0; i < 2; i++)
+ if (mtrrdeftype & mtrrdeftype_info[i].vrdi_bit)
+ DPRINTF(mtrrdeftype_info[i].vrdi_present);
+ else
+ DPRINTF(mtrrdeftype_info[i].vrdi_absent);
+
+ DPRINTF("type = ");
+ type = mtrr2mrt(mtrrdeftype & 0xff);
+ switch (type) {
+ case MDF_UNCACHEABLE: DPRINTF("UC"); break;
+ case MDF_WRITECOMBINE: DPRINTF("WC"); break;
+ case MDF_WRITETHROUGH: DPRINTF("WT"); break;
+ case MDF_WRITEPROTECT: DPRINTF("RO"); break;
+ case MDF_WRITEBACK: DPRINTF("WB"); break;
+ case MDF_UNKNOWN:
+ default:
+ DPRINTF("??");
+ break;
+ }
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_efer_value(uint64_t efer)
+{
+ struct vmm_reg_debug_info efer_info[4] = {
+ { EFER_SCE, "SCE ", "sce "},
+ { EFER_LME, "LME ", "lme "},
+ { EFER_LMA, "LMA ", "lma "},
+ { EFER_NXE, "NXE", "nxe"},
+ };
+
+ uint8_t i;
+
+ DPRINTF("(");
+ for (i = 0; i < 4; i++)
+ if (efer & efer_info[i].vrdi_bit)
+ DPRINTF(efer_info[i].vrdi_present);
+ else
+ DPRINTF(efer_info[i].vrdi_absent);
+
+ DPRINTF(")\n");
+}
+
+void
+vmm_decode_msr_value(uint64_t msr, uint64_t val)
+{
+ switch (msr) {
+ case MSR_APICBASE: vmm_decode_apicbase_msr_value(val); break;
+ case MSR_IA32_FEATURE_CONTROL: vmm_decode_ia32_fc_value(val); break;
+ case MSR_MTRRcap: vmm_decode_mtrrcap_value(val); break;
+ case MSR_PERF_STATUS: vmm_decode_perf_status_value(val); break;
+ case MSR_PERF_CTL: vmm_decode_perf_ctl_value(val); break;
+ case MSR_MTRRdefType: vmm_decode_mtrrdeftype_value(val); break;
+ case MSR_EFER: vmm_decode_efer_value(val); break;
+ default: DPRINTF("\n");
+ }
+}
+#endif /* VMM_DEBUG */
diff --git a/sys/arch/i386/i386/vmm_support.S b/sys/arch/i386/i386/vmm_support.S
new file mode 100644
index 00000000000..54d41349586
--- /dev/null
+++ b/sys/arch/i386/i386/vmm_support.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "assym.h"
+#include <machine/asm.h>
+#include <machine/specialreg.h>
+
+/*
+ * XXX duplicated in vmmvar.h due to song-and-dance with sys/rwlock.h inclusion
+ * here
+ */
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
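+
+/*
+ * These values are returned in %eax by vmx_enter_guest below when
+ * VMLAUNCH/VMRESUME fails: CF set means an invalid VMCS, ZF set means a
+ * valid VMCS with an error code readable via VMCS_INSTRUCTION_ERROR, and
+ * anything else is reported as unknown.
+ */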
+
+ .text
+ .code32
+ .align 16
+ .global _C_LABEL(vmxon)
+ .global _C_LABEL(vmxoff)
+ .global _C_LABEL(vmclear)
+ .global _C_LABEL(vmptrld)
+ .global _C_LABEL(vmptrst)
+ .global _C_LABEL(vmwrite)
+ .global _C_LABEL(vmread)
+ .global _C_LABEL(invvpid)
+ .global _C_LABEL(invept)
+ .global _C_LABEL(vmx_enter_guest)
+ .global _C_LABEL(vmm_dispatch_intr)
+
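+/*
+ * vmm_dispatch_intr(handler)
+ *
+ * Builds a trap-style frame (%ss, %esp, %eflags, %cs; the call below pushes
+ * %eip) on an aligned stack and calls the supplied interrupt stub, so that
+ * the stub's iret unwinds back here.  Presumably used to deliver an external
+ * interrupt that caused a VM exit while the CPU was running a guest.
+ */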
+_C_LABEL(vmm_dispatch_intr):
+ movl %esp, %eax
+ andl $0xFFFFFFF0, %esp
+ pushl %ss
+ pushl %eax
+ pushfl
+ pushl %cs
+ cli
+ movl 4(%eax), %eax
+ calll *%eax
+ addl $0x8, %esp
+ ret
+
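+/*
+ * The single-instruction wrappers below return 0 in %eax on success, or 1
+ * if the instruction set CF (VMfailInvalid) or ZF (VMfailValid).  invvpid
+ * and invept do not check the flags.
+ */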
+_C_LABEL(vmxon):
+ movl 4(%esp), %eax
+ vmxon (%eax)
+ jz failed_on
+ jc failed_on
+ xorl %eax, %eax
+ ret
+failed_on:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmxoff):
+ vmxoff
+ jz failed_off
+ jc failed_off
+ xorl %eax, %eax
+ ret
+failed_off:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmclear):
+ movl 0x04(%esp), %eax
+ vmclear (%eax)
+ jz failed_clear
+ jc failed_clear
+ xorl %eax, %eax
+ ret
+failed_clear:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmptrld):
+ movl 4(%esp), %eax
+ vmptrld (%eax)
+ jz failed_ptrld
+ jc failed_ptrld
+ xorl %eax, %eax
+ ret
+failed_ptrld:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmptrst):
+ movl 0x04(%esp), %eax
+ vmptrst (%eax)
+ jz failed_ptrst
+ jc failed_ptrst
+ xorl %eax, %eax
+ ret
+failed_ptrst:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmwrite):
+ movl 0x04(%esp), %eax
+ vmwrite 0x08(%esp), %eax
+ jz failed_write
+ jc failed_write
+ xorl %eax, %eax
+ ret
+failed_write:
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(vmread):
+ pushl %ebx
+ movl 0x08(%esp), %ebx
+ movl 0x0c(%esp), %eax
+ vmread %ebx, (%eax)
+ jz failed_read
+ jc failed_read
+ popl %ebx
+ xorl %eax, %eax
+ ret
+failed_read:
+ popl %ebx
+ movl $0x01, %eax
+ ret
+
+_C_LABEL(invvpid):
+ pushl %ebx
+ movl 0x08(%esp), %eax
+ movl 0x0c(%esp), %ebx
+ invvpid (%ebx), %eax
+ popl %ebx
+ ret
+
+_C_LABEL(invept):
+ movl 0x04(%esp), %eax
+ invept 0x08(%esp), %eax
+ ret
+
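+/*
+ * vmx_enter_guest
+ *
+ * Saves %ebx/%ecx/%edx, %eflags, the task register and segment selectors,
+ * %ebp/%esi/%edi and the guest register area pointer on the host stack,
+ * writes that stack pointer into VMCS_HOST_IA32_RSP, then loads the guest
+ * GPRs and %cr2 and issues VMLAUNCH (or VMRESUME when the resume flag
+ * argument is set).
+ */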
+_C_LABEL(vmx_enter_guest):
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 0x14(%esp), %edx /* Guest Regs Pointer */
+ movl 0x18(%esp), %ebx /* resume flag */
+ testl %ebx, %ebx
+ jnz skip_init
+
+ /*
+ * XXX make vmx_exit_handler a global and put this in the per-vcpu
+ * init code
+ */
+ movl $VMCS_HOST_IA32_RIP, %eax
+ movl $vmx_exit_handler_asm, %ecx
+ vmwrite %ecx, %eax
+
+skip_init:
+ pushfl
+
+ strw %ax
+ pushw %ax
+ movw %es, %ax
+ pushw %ax
+ movw %ds, %ax
+ pushw %ax
+ movw %ss, %ax
+ pushw %ax
+ pushw %fs
+ pushw %gs
+
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+ pushl %edx /* Guest Regs Pointer */
+
+ movl $VMCS_HOST_IA32_RSP, %edi
+ movl %esp, %eax
+ vmwrite %eax, %edi
+
+ testl %ebx, %ebx
+ jnz do_resume
+
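+	/*
+	 * Guest register area layout assumed by this code (and by
+	 * vmx_exit_handler_asm below): 0x00 esi, 0x04 eax, 0x08 ebx,
+	 * 0x0c ecx, 0x10 edx, 0x14 edi, 0x18 ebp, 0x1c cr2,
+	 * 0x20 instruction error.
+	 */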
+ /* Restore guest registers */
+ movl 0x1c(%edx), %eax
+ movl %eax, %cr2
+ movl 0x18(%edx), %ebp
+ movl 0x14(%edx), %edi
+ movl 0x0c(%edx), %ecx
+ movl 0x08(%edx), %ebx
+ movl 0x04(%edx), %eax
+ movl (%edx), %esi
+ movl 0x10(%edx), %edx
+
+ vmlaunch
+ jmp fail_launch_or_resume
+do_resume:
+ /* Restore guest registers */
+ movl 0x1c(%edx), %eax
+ movl %eax, %cr2
+ movl 0x18(%edx), %ebp
+ movl 0x14(%edx), %edi
+ movl 0x0c(%edx), %ecx
+ movl 0x08(%edx), %ebx
+ movl 0x04(%edx), %eax
+ movl (%edx), %esi
+ movl 0x10(%edx), %edx
+ vmresume
+fail_launch_or_resume:
+ /* Failed launch/resume (fell through) */
+ jc fail_launch_invalid_vmcs /* Invalid VMCS */
+ jz fail_launch_valid_vmcs /* Valid VMCS, failed launch/resume */
+
+	/* Unknown failure mode (not documented in the Intel SDM) */
+ movl $VMX_FAIL_LAUNCH_UNKNOWN, %eax
+ popl %edx
+ jmp restore_host
+
+fail_launch_invalid_vmcs:
+ movl $VMX_FAIL_LAUNCH_INVALID_VMCS, %eax
+ popl %edx
+ jmp restore_host
+
+fail_launch_valid_vmcs:
+ movl $VMCS_INSTRUCTION_ERROR, %edi
+ popl %edx
+ vmread %edi, %eax
+ /* XXX check failure of vmread */
+ movl %eax, 0x20(%edx)
+ movl $VMX_FAIL_LAUNCH_VALID_VMCS, %eax
+ jmp restore_host
+
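+/*
+ * vmx_exit_handler_asm
+ *
+ * VM exit entry point (installed as VMCS_HOST_IA32_RIP above).  On exit,
+ * %esp holds the value written to VMCS_HOST_IA32_RSP before entry, with the
+ * guest register area pointer on top.  Guest %esi is stashed first via a
+ * scratch push, the pointer is then popped into %esi, and the remaining
+ * guest GPRs and %cr2 are stored into the area before falling through to
+ * restore_host.
+ */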
+vmx_exit_handler_asm:
+ /* Preserve guest registers not saved in VMCS */
+ pushl %esi
+ pushl %edi
+ movl 0x8(%esp), %edi
+ movl 0x4(%esp), %esi
+ movl %esi, (%edi)
+ popl %edi
+ popl %esi /* discard */
+
+ popl %esi
+ movl %eax, 0x4(%esi)
+ movl %ebx, 0x8(%esi)
+ movl %ecx, 0xc(%esi)
+ movl %edx, 0x10(%esi)
+ movl %edi, 0x14(%esi)
+ movl %ebp, 0x18(%esi)
+ movl %cr2, %eax
+ movl %eax, 0x1c(%esi)
+
+restore_host:
+ popl %edi
+ popl %esi
+ popl %ebp
+
+ popw %gs
+ popw %fs
+ popw %ax
+ movw %ax, %ss
+ popw %ax
+ movw %ax, %ds
+ popw %ax
+ movw %ax, %es
+ xorl %ecx, %ecx
+ popw %cx
+
+ popfl
+
+ movl 0x1c(%esp), %ebx
+ leal (%ebx, %ecx), %eax
+ andb $0xF9, 5(%eax)
+ ltr %cx
+
+ popl %edx
+ popl %ecx
+ popl %ebx
+
+ xorl %eax, %eax
+
+ ret
diff --git a/sys/arch/i386/include/cpu.h b/sys/arch/i386/include/cpu.h
index 58b823d64ab..3c140f26cd3 100644
--- a/sys/arch/i386/include/cpu.h
+++ b/sys/arch/i386/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.149 2016/10/14 04:53:26 mlarkin Exp $ */
+/* $OpenBSD: cpu.h,v 1.150 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: cpu.h,v 1.35 1996/05/05 19:29:26 christos Exp $ */
/*-
@@ -69,6 +69,36 @@
struct intrsource;
+/* VMXON region (Intel) */
+struct vmxon_region {
+ uint32_t vr_revision;
+};
+
+/*
+ * VMX for Intel CPUs
+ */
+struct vmx {
+ uint64_t vmx_cr0_fixed0;
+ uint64_t vmx_cr0_fixed1;
+ uint64_t vmx_cr4_fixed0;
+ uint64_t vmx_cr4_fixed1;
+ uint32_t vmx_vmxon_revision;
+ uint32_t vmx_msr_table_size;
+ uint32_t vmx_cr3_tgt_count;
+ uint64_t vmx_vm_func;
+};
+
+/*
+ * SVM for AMD CPUs
+ */
+struct svm {
+};
+
+union vmm_cpu_cap {
+ struct vmx vcc_vmx;
+ struct svm vcc_svm;
+};
+
#ifdef _KERNEL
/* XXX stuff to move to cpuvar.h later */
struct cpu_info {
@@ -158,6 +188,14 @@ struct cpu_info {
#ifdef GPROF
struct gmonparam *ci_gmon;
#endif
+ u_int32_t ci_vmm_flags;
+#define CI_VMM_VMX (1 << 0)
+#define CI_VMM_SVM (1 << 1)
+#define CI_VMM_RVI (1 << 2)
+#define CI_VMM_EPT (1 << 3)
+ union vmm_cpu_cap ci_vmm_cap;
+ uint64_t ci_vmxon_region_pa; /* Must be 64 bit */
+ struct vmxon_region *ci_vmxon_region;
};
/*
@@ -177,6 +215,7 @@ struct cpu_info {
#define CPUF_PRESENT 0x1000 /* CPU is present */
#define CPUF_RUNNING 0x2000 /* CPU is running */
+#define CPUF_VMM 0x4000 /* CPU is executing in VMM mode */
/*
* We statically allocate the CPU info for the primary CPU (or,
diff --git a/sys/arch/i386/include/intrdefs.h b/sys/arch/i386/include/intrdefs.h
index 0384febd3f8..fba06ef79e9 100644
--- a/sys/arch/i386/include/intrdefs.h
+++ b/sys/arch/i386/include/intrdefs.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: intrdefs.h,v 1.14 2013/05/16 19:26:04 kettenis Exp $ */
+/* $OpenBSD: intrdefs.h,v 1.15 2016/10/21 06:20:58 mlarkin Exp $ */
/* $NetBSD: intrdefs.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */
#ifndef _I386_INTRDEFS_H
@@ -115,13 +115,16 @@
#define I386_IPI_GDT 0x00000020
#define I386_IPI_DDB 0x00000040 /* synchronize while in ddb */
#define I386_IPI_SETPERF 0x00000080
+#define I386_IPI_START_VMM 0x00000100
+#define I386_IPI_STOP_VMM 0x00000200
-#define I386_NIPI 8
+#define I386_NIPI 10
#define I386_IPI_NAMES { "halt IPI", "nop IPI", "FPU flush IPI", \
"FPU synch IPI", \
"MTRR update IPI", "GDT update IPI", \
- "DDB IPI", "setperf IPI" }
+ "DDB IPI", "setperf IPI", "VMM start IPI", \
+ "VMM stop IPI" }
#define IREENT_MAGIC 0x18041969
diff --git a/sys/arch/i386/include/pmap.h b/sys/arch/i386/include/pmap.h
index 1614b117cab..8751e11be56 100644
--- a/sys/arch/i386/include/pmap.h
+++ b/sys/arch/i386/include/pmap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.h,v 1.82 2016/03/15 03:17:51 guenther Exp $ */
+/* $OpenBSD: pmap.h,v 1.83 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: pmap.h,v 1.44 2000/04/24 17:18:18 thorpej Exp $ */
/*
@@ -88,6 +88,11 @@ LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
* page list, and number of PTPs within the pmap.
*/
+#define PMAP_TYPE_NORMAL 1
+#define PMAP_TYPE_EPT 2
+#define PMAP_TYPE_RVI 3
+#define pmap_nested(pm) ((pm)->pm_type != PMAP_TYPE_NORMAL)
+
struct pmap {
uint64_t pm_pdidx[4]; /* PDIEs for PAE mode */
@@ -106,6 +111,10 @@ struct pmap {
int pm_flags; /* see below */
struct segment_descriptor pm_codeseg; /* cs descriptor for process */
+ int pm_type; /* Type of pmap this is (PMAP_TYPE_x) */
+ vaddr_t pm_npt_pml4; /* Nested paging PML4 VA */
+ paddr_t pm_npt_pa; /* Nested paging PML4 PA */
+ vaddr_t pm_npt_pdpt; /* Nested paging PDPT */
};
/*
@@ -246,6 +255,7 @@ void pmap_switch(struct proc *, struct proc *);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
paddr_t vtophys(vaddr_t va);
paddr_t vtophys_pae(vaddr_t va);
+int pmap_convert(struct pmap *, int);
extern u_int32_t (*pmap_pte_set_p)(vaddr_t, paddr_t, u_int32_t);
extern u_int32_t (*pmap_pte_setbits_p)(vaddr_t, u_int32_t, u_int32_t);
diff --git a/sys/arch/i386/include/pte.h b/sys/arch/i386/include/pte.h
index c0e1ccfb83d..aa9b62341d6 100644
--- a/sys/arch/i386/include/pte.h
+++ b/sys/arch/i386/include/pte.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pte.h,v 1.21 2015/04/12 18:37:54 mlarkin Exp $ */
+/* $OpenBSD: pte.h,v 1.22 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: pte.h,v 1.11 1998/02/06 21:58:05 thorpej Exp $ */
/*
@@ -67,6 +67,13 @@
#define PG_AVAIL3 0x00000800 /* ignored by hardware */
#define PG_PATLG 0x00001000 /* PAT on large pages */
+/* EPT PTE bits */
+#define EPT_R (1ULL << 0)
+#define EPT_W (1ULL << 1)
+#define EPT_X (1ULL << 2)
+#define EPT_WB (6ULL << 3)
+#define EPT_PS (1ULL << 7)
+
/* Cacheability bits when we are using PAT */
#define PG_WB (0) /* The default */
#define PG_WC (PG_WT) /* WT and CD is WC */
diff --git a/sys/arch/i386/include/specialreg.h b/sys/arch/i386/include/specialreg.h
index 8bfd61b766e..aa02392022b 100644
--- a/sys/arch/i386/include/specialreg.h
+++ b/sys/arch/i386/include/specialreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: specialreg.h,v 1.57 2016/09/03 13:35:03 mlarkin Exp $ */
+/* $OpenBSD: specialreg.h,v 1.58 2016/10/21 06:20:59 mlarkin Exp $ */
/* $NetBSD: specialreg.h,v 1.7 1994/10/27 04:16:26 cgd Exp $ */
/*-
@@ -69,6 +69,12 @@
/* the remaining 7 bits of this register are reserved */
/*
+ * bits in CR3
+ */
+#define CR3_PWT (1ULL << 3)
+#define CR3_PCD (1ULL << 4)
+
+/*
* bits in the pentiums %cr4 register:
*/
@@ -91,6 +97,7 @@
#define CR4_OSXSAVE 0x00040000 /* enable XSAVE and extended states */
#define CR4_SMEP 0x00100000 /* supervisor mode exec protection */
#define CR4_SMAP 0x00200000 /* supervisor mode access prevention */
+#define CR4_PKE 0x00400000 /* protection key enable */
/*
* CPUID "features" bits (CPUID function 0x1):
@@ -296,14 +303,20 @@
#define P5MSR_CTR0 0x012 /* P5 only (trap on P6) */
#define P5MSR_CTR1 0x013 /* P5 only (trap on P6) */
#define MSR_APICBASE 0x01b
+#define APICBASE_BSP 0x100
+#define APICBASE_ENABLE_X2APIC 0x400
+#define APICBASE_GLOBAL_ENABLE 0x800
#define MSR_EBL_CR_POWERON 0x02a
#define MSR_EBC_FREQUENCY_ID 0x02c /* Pentium 4 only */
#define MSR_TEST_CTL 0x033
+#define MSR_IA32_FEATURE_CONTROL 0x03a
#define MSR_BIOS_UPDT_TRIG 0x079
#define MSR_BBL_CR_D0 0x088 /* PII+ only */
#define MSR_BBL_CR_D1 0x089 /* PII+ only */
#define MSR_BBL_CR_D2 0x08a /* PII+ only */
#define MSR_BIOS_SIGN 0x08b
+#define MSR_PERFCTR0 0x0c1
+#define MSR_PERFCTR1 0x0c2
#define P6MSR_CTR0 0x0c1
#define P6MSR_CTR1 0x0c2
#define MSR_FSB_FREQ 0x0cd /* Core Duo/Solo only */
@@ -422,6 +435,7 @@
#define EFER_LME 0x00000100 /* Long Mode Enabled */
#define EFER_LMA 0x00000400 /* Long Mode Active */
#define EFER_NXE 0x00000800 /* No-Execute Enabled */
+#define EFER_SVME 0x00001000 /* SVM Enabled */
#define MSR_STAR 0xc0000081 /* 32 bit syscall gate addr */
#define MSR_LSTAR 0xc0000082 /* 64 bit syscall gate addr */
@@ -688,3 +702,354 @@
#define C3_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */
#define C3_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
#define C3_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */
+
+/*
+ * VMX
+ */
+#define IA32_FEATURE_CONTROL_LOCK 0x01
+#define IA32_FEATURE_CONTROL_SMX_EN 0x02
+#define IA32_FEATURE_CONTROL_VMX_EN 0x04
+#define IA32_FEATURE_CONTROL_SENTER_EN (1ULL << 15)
+#define IA32_FEATURE_CONTROL_SENTER_PARAM_MASK 0x7f00
+#define IA32_VMX_BASIC 0x480
+#define IA32_VMX_PINBASED_CTLS 0x481
+#define IA32_VMX_PROCBASED_CTLS 0x482
+#define IA32_VMX_EXIT_CTLS 0x483
+#define IA32_VMX_ENTRY_CTLS 0x484
+#define IA32_VMX_MISC 0x485
+#define IA32_VMX_CR0_FIXED0 0x486
+#define IA32_VMX_CR0_FIXED1 0x487
+#define IA32_VMX_CR4_FIXED0 0x488
+#define IA32_VMX_CR4_FIXED1 0x489
+#define IA32_VMX_PROCBASED2_CTLS 0x48B
+#define IA32_VMX_EPT_VPID_CAP 0x48C
+#define IA32_VMX_TRUE_PINBASED_CTLS 0x48D
+#define IA32_VMX_TRUE_PROCBASED_CTLS 0x48E
+#define IA32_VMX_TRUE_EXIT_CTLS 0x48F
+#define IA32_VMX_TRUE_ENTRY_CTLS 0x490
+#define IA32_VMX_VMFUNC 0x491
+
+#define IA32_EPT_VPID_CAP_PAGE_WALK_4 (1ULL << 6)
+#define IA32_EPT_VPID_CAP_WB (1ULL << 14)
+#define IA32_EPT_VPID_CAP_AD_BITS (1ULL << 21)
+
+#define IA32_EPT_PAGING_CACHE_TYPE_UC 0x0
+#define IA32_EPT_PAGING_CACHE_TYPE_WB 0x6
+#define IA32_EPT_AD_BITS_ENABLE (1ULL << 6)
+#define IA32_EPT_PAGE_WALK_LENGTH 0x4
+
+/* VMX : IA32_VMX_BASIC bits */
+#define IA32_VMX_TRUE_CTLS_AVAIL (1ULL << 55)
+
+/* VMX : IA32_VMX_PINBASED_CTLS bits */
+#define IA32_VMX_EXTERNAL_INT_EXITING (1ULL << 0)
+#define IA32_VMX_NMI_EXITING (1ULL << 3)
+#define IA32_VMX_VIRTUAL_NMIS (1ULL << 5)
+#define IA32_VMX_ACTIVATE_VMX_PREEMPTION_TIMER (1ULL << 6)
+#define IA32_VMX_PROCESS_POSTED_INTERRUPTS (1ULL << 7)
+
+/* VMX : IA32_VMX_PROCBASED_CTLS bits */
+#define IA32_VMX_INTERRUPT_WINDOW_EXITING (1ULL << 2)
+#define IA32_VMX_USE_TSC_OFFSETTING (1ULL << 3)
+#define IA32_VMX_HLT_EXITING (1ULL << 7)
+#define IA32_VMX_INVLPG_EXITING (1ULL << 9)
+#define IA32_VMX_MWAIT_EXITING (1ULL << 10)
+#define IA32_VMX_RDPMC_EXITING (1ULL << 11)
+#define IA32_VMX_RDTSC_EXITING (1ULL << 12)
+#define IA32_VMX_CR3_LOAD_EXITING (1ULL << 15)
+#define IA32_VMX_CR3_STORE_EXITING (1ULL << 16)
+#define IA32_VMX_CR8_LOAD_EXITING (1ULL << 19)
+#define IA32_VMX_CR8_STORE_EXITING (1ULL << 20)
+#define IA32_VMX_USE_TPR_SHADOW (1ULL << 21)
+#define IA32_VMX_NMI_WINDOW_EXITING (1ULL << 22)
+#define IA32_VMX_MOV_DR_EXITING (1ULL << 23)
+#define IA32_VMX_UNCONDITIONAL_IO_EXITING (1ULL << 24)
+#define IA32_VMX_USE_IO_BITMAPS (1ULL << 25)
+#define IA32_VMX_MONITOR_TRAP_FLAG (1ULL << 27)
+#define IA32_VMX_USE_MSR_BITMAPS (1ULL << 28)
+#define IA32_VMX_MONITOR_EXITING (1ULL << 29)
+#define IA32_VMX_PAUSE_EXITING (1ULL << 30)
+#define IA32_VMX_ACTIVATE_SECONDARY_CONTROLS (1ULL << 31)
+
+/* VMX : IA32_VMX_PROCBASED2_CTLS bits */
+#define IA32_VMX_VIRTUALIZE_APIC (1ULL << 0)
+#define IA32_VMX_ENABLE_EPT (1ULL << 1)
+#define IA32_VMX_DESCRIPTOR_TABLE_EXITING (1ULL << 2)
+#define IA32_VMX_ENABLE_RDTSCP (1ULL << 3)
+#define IA32_VMX_VIRTUALIZE_X2APIC_MODE (1ULL << 4)
+#define IA32_VMX_ENABLE_VPID (1ULL << 5)
+#define IA32_VMX_WBINVD_EXITING (1ULL << 6)
+#define IA32_VMX_UNRESTRICTED_GUEST (1ULL << 7)
+#define IA32_VMX_APIC_REGISTER_VIRTUALIZATION (1ULL << 8)
+#define IA32_VMX_VIRTUAL_INTERRUPT_DELIVERY (1ULL << 9)
+#define IA32_VMX_PAUSE_LOOP_EXITING (1ULL << 10)
+#define IA32_VMX_RDRAND_EXITING (1ULL << 11)
+#define IA32_VMX_ENABLE_INVPCID (1ULL << 12)
+#define IA32_VMX_ENABLE_VM_FUNCTIONS (1ULL << 13)
+#define IA32_VMX_VMCS_SHADOWING (1ULL << 14)
+#define IA32_VMX_ENABLE_ENCLS_EXITING (1ULL << 15)
+#define IA32_VMX_RDSEED_EXITING (1ULL << 16)
+#define IA32_VMX_ENABLE_PML (1ULL << 17)
+#define IA32_VMX_EPT_VIOLATION_VE (1ULL << 18)
+#define IA32_VMX_CONCEAL_VMX_FROM_PT (1ULL << 19)
+#define IA32_VMX_ENABLE_XSAVES_XRSTORS (1ULL << 20)
+#define IA32_VMX_ENABLE_TSC_SCALING (1ULL << 25)
+
+/* VMX : IA32_VMX_EXIT_CTLS bits */
+#define IA32_VMX_SAVE_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_HOST_SPACE_ADDRESS_SIZE (1ULL << 9)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_EXIT (1ULL << 12)
+#define IA32_VMX_ACKNOWLEDGE_INTERRUPT_ON_EXIT (1ULL << 15)
+#define IA32_VMX_SAVE_IA32_PAT_ON_EXIT (1ULL << 18)
+#define IA32_VMX_LOAD_IA32_PAT_ON_EXIT (1ULL << 19)
+#define IA32_VMX_SAVE_IA32_EFER_ON_EXIT (1ULL << 20)
+#define IA32_VMX_LOAD_IA32_EFER_ON_EXIT (1ULL << 21)
+#define IA32_VMX_SAVE_VMX_PREEMPTION_TIMER (1ULL << 22)
+#define IA32_VMX_CLEAR_IA32_BNDCFGS_ON_EXIT (1ULL << 23)
+#define IA32_VMX_CONCEAL_VM_EXITS_FROM_PT (1ULL << 24)
+
+/* VMX: IA32_VMX_ENTRY_CTLS bits */
+#define IA32_VMX_LOAD_DEBUG_CONTROLS (1ULL << 2)
+#define IA32_VMX_IA32E_MODE_GUEST (1ULL << 9)
+#define IA32_VMX_ENTRY_TO_SMM (1ULL << 10)
+#define IA32_VMX_DEACTIVATE_DUAL_MONITOR_TREATMENT (1ULL << 11)
+#define IA32_VMX_LOAD_IA32_PERF_GLOBAL_CTRL_ON_ENTRY (1ULL << 13)
+#define IA32_VMX_LOAD_IA32_PAT_ON_ENTRY (1ULL << 14)
+#define IA32_VMX_LOAD_IA32_EFER_ON_ENTRY (1ULL << 15)
+#define IA32_VMX_LOAD_IA32_BNDCFGS_ON_ENTRY (1ULL << 16)
+#define IA32_VMX_CONCEAL_VM_ENTRIES_FROM_PT (1ULL << 17)
+
+/*
+ * VMX : VMCS Fields
+ */
+
+/* 16-bit control fields */
+#define VMCS_GUEST_VPID 0x0000
+#define VMCS_POSTED_INT_NOTIF_VECTOR 0x0002
+#define VMCS_EPTP_INDEX 0x0004
+
+/* 16-bit guest state fields */
+#define VMCS_GUEST_IA32_ES_SEL 0x0800
+#define VMCS_GUEST_IA32_CS_SEL 0x0802
+#define VMCS_GUEST_IA32_SS_SEL 0x0804
+#define VMCS_GUEST_IA32_DS_SEL 0x0806
+#define VMCS_GUEST_IA32_FS_SEL 0x0808
+#define VMCS_GUEST_IA32_GS_SEL 0x080A
+#define VMCS_GUEST_IA32_LDTR_SEL 0x080C
+#define VMCS_GUEST_IA32_TR_SEL 0x080E
+#define VMCS_GUEST_INTERRUPT_STATUS 0x0810
+#define VMCS_GUEST_PML_INDEX 0x0812
+
+/* 16-bit host state fields */
+#define VMCS_HOST_IA32_ES_SEL 0x0C00
+#define VMCS_HOST_IA32_CS_SEL 0x0C02
+#define VMCS_HOST_IA32_SS_SEL 0x0C04
+#define VMCS_HOST_IA32_DS_SEL 0x0C06
+#define VMCS_HOST_IA32_FS_SEL 0x0C08
+#define VMCS_HOST_IA32_GS_SEL 0x0C0A
+#define VMCS_HOST_IA32_TR_SEL 0x0C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x2000
+#define VMCS_IO_BITMAP_B 0x2002
+#define VMCS_MSR_BITMAP_ADDRESS 0x2004
+#define VMCS_MSR_BITMAP_ADDRESS_HI 0x2005
+#define VMCS_EXIT_STORE_MSR_ADDRESS 0x2006
+#define VMCS_EXIT_STORE_MSR_ADDRESS_HI 0x2007
+#define VMCS_EXIT_LOAD_MSR_ADDRESS 0x2008
+#define VMCS_EXIT_LOAD_MSR_ADDRESS_HI 0x2009
+#define VMCS_ENTRY_LOAD_MSR_ADDRESS 0x200A
+#define VMCS_ENTRY_LOAD_MSR_ADDRESS_HI 0x200B
+#define VMCS_EXECUTIVE_VMCS_POINTER 0x200C
+#define VMCS_PML_ADDRESS 0x200E
+#define VMCS_TSC_OFFSET 0x2010
+#define VMCS_VIRTUAL_APIC_ADDRESS 0x2012
+#define VMCS_APIC_ACCESS_ADDRESS 0x2014
+#define VMCS_POSTED_INTERRUPT_DESC 0x2016
+#define VMCS_VM_FUNCTION_CONTROLS 0x2018
+#define VMCS_GUEST_IA32_EPTP 0x201A
+#define VMCS_GUEST_IA32_EPTP_HI 0x201B
+#define VMCS_EOI_EXIT_BITMAP_0 0x201C
+#define VMCS_EOI_EXIT_BITMAP_1 0x201E
+#define VMCS_EOI_EXIT_BITMAP_2 0x2020
+#define VMCS_EOI_EXIT_BITMAP_3 0x2022
+#define VMCS_EPTP_LIST_ADDRESS 0x2024
+#define VMCS_VMREAD_BITMAP_ADDRESS 0x2026
+#define VMCS_VMWRITE_BITMAP_ADDRESS 0x2028
+#define VMCS_VIRTUALIZATION_EXC_ADDRESS 0x202A
+#define VMCS_XSS_EXITING_BITMAP 0x202C
+#define VMCS_ENCLS_EXITING_BITMAP 0x202E
+#define VMCS_TSC_MULTIPLIER 0x2032
+
+/* 64-bit RO data field */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x2400
+#define VMCS_GUEST_PHYSICAL_ADDRESS_HI 0x2401
+
+/* 64-bit guest state fields */
+#define VMCS_LINK_POINTER 0x2800
+#define VMCS_LINK_POINTER_HI 0x2801
+#define VMCS_GUEST_IA32_DEBUGCTL 0x2802
+#define VMCS_GUEST_IA32_PAT 0x2804
+#define VMCS_GUEST_IA32_EFER 0x2806
+#define VMCS_GUEST_IA32_PERF_GBL_CTRL 0x2808
+#define VMCS_GUEST_PDPTE0 0x280A
+#define VMCS_GUEST_PDPTE1 0x280C
+#define VMCS_GUEST_PDPTE2 0x280E
+#define VMCS_GUEST_PDPTE3 0x2810
+#define VMCS_GUEST_IA32_BNDCFGS 0x2812
+
+/* 64-bit host state fields */
+#define VMCS_HOST_IA32_PAT 0x2C00
+#define VMCS_HOST_IA32_EFER 0x2C02
+#define VMCS_HOST_IA32_PERF_GBL_CTRL 0x2C04
+
+/* 32-bit control fields */
+#define VMCS_PINBASED_CTLS 0x4000
+#define VMCS_PROCBASED_CTLS 0x4002
+#define VMCS_EXCEPTION_BITMAP 0x4004
+#define VMCS_PF_ERROR_CODE_MASK 0x4006
+#define VMCS_PF_ERROR_CODE_MATCH 0x4008
+#define VMCS_CR3_TARGET_COUNT 0x400A
+#define VMCS_EXIT_CTLS 0x400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x4010
+#define VMCS_ENTRY_CTLS 0x4012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x4014
+#define VMCS_ENTRY_INTERRUPTION_INFO 0x4016
+#define VMCS_ENTRY_EXCEPTION_ERROR_CODE 0x4018
+#define VMCS_ENTRY_INSTRUCTION_LENGTH 0x401A
+#define VMCS_TPR_THRESHOLD 0x401C
+#define VMCS_PROCBASED2_CTLS 0x401E
+#define VMCS_PLE_GAP 0x4020
+#define VMCS_PLE_WINDOW 0x4022
+
+/* 32-bit RO data fields */
+#define VMCS_INSTRUCTION_ERROR 0x4400
+#define VMCS_EXIT_REASON 0x4402
+#define VMCS_EXIT_INTERRUPTION_INFO 0x4404
+#define VMCS_EXIT_INTERRUPTION_ERR_CODE 0x4406
+#define VMCS_IDT_VECTORING_INFO 0x4408
+#define VMCS_IDT_VECTORING_ERROR_CODE 0x440A
+#define VMCS_INSTRUCTION_LENGTH 0x440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x440E
+
+/* 32-bit guest state fields */
+#define VMCS_GUEST_IA32_ES_LIMIT 0x4800
+#define VMCS_GUEST_IA32_CS_LIMIT 0x4802
+#define VMCS_GUEST_IA32_SS_LIMIT 0x4804
+#define VMCS_GUEST_IA32_DS_LIMIT 0x4806
+#define VMCS_GUEST_IA32_FS_LIMIT 0x4808
+#define VMCS_GUEST_IA32_GS_LIMIT 0x480A
+#define VMCS_GUEST_IA32_LDTR_LIMIT 0x480C
+#define VMCS_GUEST_IA32_TR_LIMIT 0x480E
+#define VMCS_GUEST_IA32_GDTR_LIMIT 0x4810
+#define VMCS_GUEST_IA32_IDTR_LIMIT 0x4812
+#define VMCS_GUEST_IA32_ES_AR 0x4814
+#define VMCS_GUEST_IA32_CS_AR 0x4816
+#define VMCS_GUEST_IA32_SS_AR 0x4818
+#define VMCS_GUEST_IA32_DS_AR 0x481A
+#define VMCS_GUEST_IA32_FS_AR 0x481C
+#define VMCS_GUEST_IA32_GS_AR 0x481E
+#define VMCS_GUEST_IA32_LDTR_AR 0x4820
+#define VMCS_GUEST_IA32_TR_AR 0x4822
+#define VMCS_GUEST_INTERRUPTIBILITY_ST 0x4824
+#define VMCS_GUEST_ACTIVITY_STATE 0x4826
+#define VMCS_GUEST_SMBASE 0x4828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x482A
+#define VMCS_VMX_PREEMPTION_TIMER_VAL 0x482E
+
+/* 32-bit host state field */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x4C00
+
+/* Natural-width control fields */
+#define VMCS_CR0_MASK 0x6000
+#define VMCS_CR4_MASK 0x6002
+#define VMCS_CR0_READ_SHADOW 0x6004
+#define VMCS_CR4_READ_SHADOW 0x6006
+#define VMCS_CR3_TARGET_0 0x6008
+#define VMCS_CR3_TARGET_1 0x600A
+#define VMCS_CR3_TARGET_2 0x600C
+#define VMCS_CR3_TARGET_3 0x600E
+
+/* Natural-width RO fields */
+#define VMCS_GUEST_EXIT_QUALIFICATION 0x6400
+#define VMCS_IO_RCX 0x6402
+#define VMCS_IO_RSI 0x6404
+#define VMCS_IO_RDI 0x6406
+#define VMCS_IO_RIP 0x6408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x640A
+
+/* Natural-width guest state fields */
+#define VMCS_GUEST_IA32_CR0 0x6800
+#define VMCS_GUEST_IA32_CR3 0x6802
+#define VMCS_GUEST_IA32_CR4 0x6804
+#define VMCS_GUEST_IA32_ES_BASE 0x6806
+#define VMCS_GUEST_IA32_CS_BASE 0x6808
+#define VMCS_GUEST_IA32_SS_BASE 0x680A
+#define VMCS_GUEST_IA32_DS_BASE 0x680C
+#define VMCS_GUEST_IA32_FS_BASE 0x680E
+#define VMCS_GUEST_IA32_GS_BASE 0x6810
+#define VMCS_GUEST_IA32_LDTR_BASE 0x6812
+#define VMCS_GUEST_IA32_TR_BASE 0x6814
+#define VMCS_GUEST_IA32_GDTR_BASE 0x6816
+#define VMCS_GUEST_IA32_IDTR_BASE 0x6818
+#define VMCS_GUEST_IA32_DR7 0x681A
+#define VMCS_GUEST_IA32_RSP 0x681C
+#define VMCS_GUEST_IA32_RIP 0x681E
+#define VMCS_GUEST_IA32_RFLAGS 0x6820
+#define VMCS_GUEST_PENDING_DBG_EXC 0x6822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x6824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x6826
+
+/* Natural-width host state fields */
+#define VMCS_HOST_IA32_CR0 0x6C00
+#define VMCS_HOST_IA32_CR3 0x6C02
+#define VMCS_HOST_IA32_CR4 0x6C04
+#define VMCS_HOST_IA32_FS_BASE 0x6C06
+#define VMCS_HOST_IA32_GS_BASE 0x6C08
+#define VMCS_HOST_IA32_TR_BASE 0x6C0A
+#define VMCS_HOST_IA32_GDTR_BASE 0x6C0C
+#define VMCS_HOST_IA32_IDTR_BASE 0x6C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x6C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x6C12
+#define VMCS_HOST_IA32_RSP 0x6C14
+#define VMCS_HOST_IA32_RIP 0x6C16
+
+#define IA32_VMX_INVVPID_INDIV_ADDR_CTX 0x0
+#define IA32_VMX_INVVPID_SINGLE_CTX 0x1
+#define IA32_VMX_INVVPID_ALL_CTX 0x2
+#define IA32_VMX_INVVPID_SINGLE_CTX_GLB 0x3
+
+#define IA32_VMX_INVEPT_SINGLE_CTX 0x1
+#define IA32_VMX_INVEPT_GLOBAL_CTX 0x2
+
+#define IA32_VMX_EPT_FAULT_READ (1ULL << 0)
+#define IA32_VMX_EPT_FAULT_WRITE (1ULL << 1)
+#define IA32_VMX_EPT_FAULT_EXEC (1ULL << 2)
+
+#define IA32_VMX_EPT_FAULT_WAS_READABLE (1ULL << 3)
+#define IA32_VMX_EPT_FAULT_WAS_WRITABLE (1ULL << 4)
+#define IA32_VMX_EPT_FAULT_WAS_EXECABLE (1ULL << 5)
+
+#define IA32_VMX_MSR_LIST_SIZE_MASK (7ULL << 25)
+#define IA32_VMX_CR3_TGT_SIZE_MASK (0x1FFULL << 16)
+
+/*
+ * SVM
+ */
+#define MSR_AMD_VM_CR 0xc0010114
+#define CPUID_AMD_SVM_CAP 0x8000000A
+#define AMD_SVMDIS 0x10
+#define AMD_SVM_NESTED_PAGING_CAP (1 << 0)
+
+/*
+ * PAT
+ */
+#define PATENTRY(n, type) ((uint64_t)type << ((n) * 8))
+#define PAT_UC 0x0UL
+#define PAT_WC 0x1UL
+#define PAT_WT 0x4UL
+#define PAT_WP 0x5UL
+#define PAT_WB 0x6UL
+#define PAT_UCMINUS 0x7UL
+
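PATENTRY() shifts one memory type into the 8-bit slot for PAT entry n, so a full PAT MSR value is built by OR'ing entries together. A purely illustrative composition (the variable name is mine):

/* Illustrative: entry 0 write-back, entry 1 write-combining, rest UC. */
uint64_t pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
    PATENTRY(2, PAT_UC) | PATENTRY(3, PAT_UC);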
diff --git a/sys/arch/i386/include/vmmvar.h b/sys/arch/i386/include/vmmvar.h
new file mode 100644
index 00000000000..4b8edf7756b
--- /dev/null
+++ b/sys/arch/i386/include/vmmvar.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2014 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * CPU capabilities for VMM operation
+ */
+#ifndef _MACHINE_VMMVAR_H_
+#define _MACHINE_VMMVAR_H_
+
+#define VMM_HV_SIGNATURE "OpenBSDVMM58"
+
+#define VMM_MAX_MEM_RANGES 16
+#define VMM_MAX_DISKS_PER_VM 2
+#define VMM_MAX_PATH_DISK 128
+#define VMM_MAX_NAME_LEN 32
+#define VMM_MAX_KERNEL_PATH 128
+#define VMM_MAX_VCPUS_PER_VM 64
+#define VMM_MAX_VM_MEM_SIZE 2048
+#define VMM_MAX_NICS_PER_VM 2
+
+#define VMM_PCI_MMIO_BAR_BASE 0xF0000000
+#define VMM_PCI_MMIO_BAR_END 0xF0FFFFFF
+#define VMM_PCI_MMIO_BAR_SIZE 0x00010000
+#define VMM_PCI_IO_BAR_BASE 0x1000
+#define VMM_PCI_IO_BAR_END 0xFFFF
+#define VMM_PCI_IO_BAR_SIZE 0x1000
+
+/* VMX: Basic Exit Reasons */
+#define VMX_EXIT_NMI 0
+#define VMX_EXIT_EXTINT 1
+#define VMX_EXIT_TRIPLE_FAULT 2
+#define VMX_EXIT_INIT 3
+#define VMX_EXIT_SIPI 4
+#define VMX_EXIT_IO_SMI 5
+#define VMX_EXIT_OTHER_SMI 6
+#define VMX_EXIT_INT_WINDOW 7
+#define VMX_EXIT_NMI_WINDOW 8
+#define VMX_EXIT_TASK_SWITCH 9
+#define VMX_EXIT_CPUID 10
+#define VMX_EXIT_GETSEC 11
+#define VMX_EXIT_HLT 12
+#define VMX_EXIT_INVD 13
+#define VMX_EXIT_INVLPG 14
+#define VMX_EXIT_RDPMC 15
+#define VMX_EXIT_RDTSC 16
+#define VMX_EXIT_RSM 17
+#define VMX_EXIT_VMCALL 18
+#define VMX_EXIT_VMCLEAR 19
+#define VMX_EXIT_VMLAUNCH 20
+#define VMX_EXIT_VMPTRLD 21
+#define VMX_EXIT_VMPTRST 22
+#define VMX_EXIT_VMREAD 23
+#define VMX_EXIT_VMRESUME 24
+#define VMX_EXIT_VMWRITE 25
+#define VMX_EXIT_VMXOFF 26
+#define VMX_EXIT_VMXON 27
+#define VMX_EXIT_CR_ACCESS 28
+#define VMX_EXIT_MOV_DR 29
+#define VMX_EXIT_IO 30
+#define VMX_EXIT_RDMSR 31
+#define VMX_EXIT_WRMSR 32
+#define VMX_EXIT_ENTRY_FAILED_GUEST_STATE 33
+#define VMX_EXIT_ENTRY_FAILED_MSR_LOAD 34
+#define VMX_EXIT_MWAIT 36
+#define VMX_EXIT_MTF 37
+#define VMX_EXIT_MONITOR 39
+#define VMX_EXIT_PAUSE 40
+#define VMX_EXIT_ENTRY_FAILED_MCE 41
+#define VMX_EXIT_TPR_BELOW_THRESHOLD 43
+#define VMX_EXIT_APIC_ACCESS 44
+#define VMX_EXIT_VIRTUALIZED_EOI 45
+#define VMX_EXIT_GDTR_IDTR 46
+#define VMX_EXIT_LDTR_TR 47
+#define VMX_EXIT_EPT_VIOLATION 48
+#define VMX_EXIT_EPT_MISCONFIGURATION 49
+#define VMX_EXIT_INVEPT 50
+#define VMX_EXIT_RDTSCP 51
+#define VMX_EXIT_VMX_PREEMPTION_TIMER_EXPIRED 52
+#define VMX_EXIT_INVVPID 53
+#define VMX_EXIT_WBINVD 54
+#define VMX_EXIT_XSETBV 55
+#define VMX_EXIT_APIC_WRITE 56
+#define VMX_EXIT_RDRAND 57
+#define VMX_EXIT_INVPCID 58
+#define VMX_EXIT_VMFUNC 59
+#define VMX_EXIT_RDSEED 61
+#define VMX_EXIT_XSAVES 63
+#define VMX_EXIT_XRSTORS 64
+
+/*
+ * VMX: Misc defines
+ */
+#define VMX_MAX_CR3_TARGETS 256
+
+#define VM_EXIT_TERMINATED 0xFFFE
+#define VM_EXIT_NONE 0xFFFF
+
+/*
+ * VCPU state values. Note that there is a conversion function in vmm.c
+ * (vcpu_state_decode) that converts these to human readable strings,
+ * so this enum and vcpu_state_decode should be kept in sync.
+ */
+enum {
+ VCPU_STATE_STOPPED,
+ VCPU_STATE_RUNNING,
+ VCPU_STATE_REQTERM,
+ VCPU_STATE_TERMINATED,
+ VCPU_STATE_UNKNOWN,
+};
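The note above refers to vcpu_state_decode() in vmm.c; a sketch of the shape such a decoder takes (the exact strings used by the committed vmm.c may differ):

static const char *
vcpu_state_decode_sketch(u_int state)
{
	switch (state) {
	case VCPU_STATE_STOPPED:	return "stopped";
	case VCPU_STATE_RUNNING:	return "running";
	case VCPU_STATE_REQTERM:	return "requesting termination";
	case VCPU_STATE_TERMINATED:	return "terminated";
	default:			return "unknown";
	}
}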
+
+enum {
+ VEI_DIR_OUT,
+ VEI_DIR_IN
+};
+
+/*
+ * vm exit data
+ * vm_exit_inout : describes an IN/OUT exit
+ */
+struct vm_exit_inout {
+ uint8_t vei_size; /* Size of access */
+ uint8_t vei_dir; /* Direction */
+ uint8_t vei_rep; /* REP prefix? */
+ uint8_t vei_string; /* string variety? */
+ uint8_t vei_encoding; /* operand encoding */
+ uint16_t vei_port; /* port */
+ uint32_t vei_data; /* data (for IN insns) */
+};
+
+union vm_exit {
+ struct vm_exit_inout vei; /* IN/OUT exit */
+};
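On an I/O exit the kernel describes the access in struct vm_exit_inout and returns it through the vm_run_params exit pointer; per the field comment above, vei_data carries the value for IN instructions, so an emulator fills it before the next VMM_IOC_RUN. A hypothetical fragment (emulate_in() is a placeholder, not part of this diff):

static void
complete_inout(union vm_exit *exit)
{
	struct vm_exit_inout *vei = &exit->vei;

	/* Only IN needs a value supplied back to the guest. */
	if (vei->vei_dir == VEI_DIR_IN)
		vei->vei_data = emulate_in(vei->vei_port, vei->vei_size);
}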
+
+/*
+ * struct vcpu_segment_info describes a segment + selector set, used
+ * in constructing the initial vcpu register content
+ */
+struct vcpu_segment_info {
+ uint16_t vsi_sel;
+ uint32_t vsi_limit;
+ uint32_t vsi_ar;
+ uint32_t vsi_base;
+};
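As an illustration of how a vcpu_segment_info might be populated when building initial register state, a flat 4 GB protected-mode code segment could look like the following; the access-rights value uses the VMCS encoding and is an example, not copied from vmd:

struct vcpu_segment_info cs_example = {
	.vsi_sel = 0x08,		/* GDT slot 1, ring 0 */
	.vsi_limit = 0xffffffff,	/* 4 GB flat */
	.vsi_ar = 0xc09b,		/* present, code, 32-bit, 4K gran */
	.vsi_base = 0
};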
+
+#define VCPU_REGS_EAX 0
+#define VCPU_REGS_EBX 1
+#define VCPU_REGS_ECX 2
+#define VCPU_REGS_EDX 3
+#define VCPU_REGS_ESI 4
+#define VCPU_REGS_EDI 5
+#define VCPU_REGS_ESP 6
+#define VCPU_REGS_EBP 7
+#define VCPU_REGS_EIP 8
+#define VCPU_REGS_EFLAGS 9
+#define VCPU_REGS_NGPRS (VCPU_REGS_EFLAGS + 1)
+
+#define VCPU_REGS_CR0 0
+#define VCPU_REGS_CR2 1
+#define VCPU_REGS_CR3 2
+#define VCPU_REGS_CR4 3
+#define VCPU_REGS_CR8 4
+#define VCPU_REGS_NCRS (VCPU_REGS_CR8 + 1)
+
+#define VCPU_REGS_CS 0
+#define VCPU_REGS_DS 1
+#define VCPU_REGS_ES 2
+#define VCPU_REGS_FS 3
+#define VCPU_REGS_GS 4
+#define VCPU_REGS_SS 5
+#define VCPU_REGS_LDTR 6
+#define VCPU_REGS_TR 7
+#define VCPU_REGS_NSREGS (VCPU_REGS_TR + 1)
+
+struct vcpu_reg_state {
+ uint32_t vrs_gprs[VCPU_REGS_NGPRS];
+ uint32_t vrs_crs[VCPU_REGS_NCRS];
+ struct vcpu_segment_info vrs_sregs[VCPU_REGS_NSREGS];
+ struct vcpu_segment_info vrs_gdtr;
+ struct vcpu_segment_info vrs_idtr;
+};
+
+struct vm_mem_range {
+ paddr_t vmr_gpa;
+ vaddr_t vmr_va;
+ size_t vmr_size;
+};
+
+struct vm_create_params {
+ /* Input parameters to VMM_IOC_CREATE */
+ size_t vcp_nmemranges;
+ size_t vcp_ncpus;
+ size_t vcp_ndisks;
+ size_t vcp_nnics;
+ struct vm_mem_range vcp_memranges[VMM_MAX_MEM_RANGES];
+ char vcp_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_PATH_DISK];
+ char vcp_name[VMM_MAX_NAME_LEN];
+ char vcp_kernel[VMM_MAX_KERNEL_PATH];
+ uint8_t vcp_macs[VMM_MAX_NICS_PER_VM][6];
+
+ /* Output parameter from VMM_IOC_CREATE */
+ uint32_t vcp_id;
+};
+
+struct vm_run_params {
+ /* Input parameters to VMM_IOC_RUN */
+ uint32_t vrp_vm_id;
+ uint32_t vrp_vcpu_id;
+ uint8_t vrp_continue; /* Continuing from an exit */
+ uint16_t vrp_irq; /* IRQ to inject */
+
+ /* Input/output parameter to VMM_IOC_RUN */
+ union vm_exit *vrp_exit; /* updated exit data */
+
+ /* Output parameter from VMM_IOC_RUN */
+ uint16_t vrp_exit_reason; /* exit reason */
+ uint8_t vrp_irqready; /* ready for IRQ on entry */
+};
+
+struct vm_info_result {
+ /* Output parameters from VMM_IOC_INFO */
+ size_t vir_memory_size;
+ size_t vir_used_size;
+ size_t vir_ncpus;
+ uint8_t vir_vcpu_state[VMM_MAX_VCPUS_PER_VM];
+ pid_t vir_creator_pid;
+ uint32_t vir_id;
+ char vir_name[VMM_MAX_NAME_LEN];
+};
+
+struct vm_info_params {
+ /* Input parameters to VMM_IOC_INFO */
+ size_t vip_size; /* Output buffer size */
+
+ /* Output Parameters from VMM_IOC_INFO */
+ size_t vip_info_ct; /* # of entries returned */
+ struct vm_info_result *vip_info; /* Output buffer */
+};
+
+struct vm_terminate_params {
+ /* Input parameters to VMM_IOC_TERM */
+ uint32_t vtp_vm_id;
+};
+
+struct vm_resetcpu_params {
+ /* Input parameters to VMM_IOC_RESETCPU */
+ uint32_t vrp_vm_id;
+ uint32_t vrp_vcpu_id;
+ struct vcpu_reg_state vrp_init_state;
+};
+
+struct vm_intr_params {
+ /* Input parameters to VMM_IOC_INTR */
+ uint32_t vip_vm_id;
+ uint32_t vip_vcpu_id;
+ uint16_t vip_intr;
+};
+
+#define VM_RWREGS_GPRS 0x1 /* read/write GPRs */
+#define VM_RWREGS_SREGS 0x2 /* read/write segment registers */
+#define VM_RWREGS_CRS 0x4 /* read/write CRs */
+#define VM_RWREGS_ALL (VM_RWREGS_GPRS | VM_RWREGS_SREGS | VM_RWREGS_CRS)
+
+struct vm_rwregs_params {
+ uint32_t vrwp_vm_id;
+ uint32_t vrwp_vcpu_id;
+ uint64_t vrwp_mask;
+ struct vcpu_reg_state vrwp_regs;
+};
+
+/* IOCTL definitions */
+#define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
+#define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
+#define VMM_IOC_INFO _IOWR('V', 3, struct vm_info_params) /* Get VM Info */
+#define VMM_IOC_TERM _IOW('V', 4, struct vm_terminate_params) /* Terminate VM */
+#define VMM_IOC_RESETCPU _IOW('V', 5, struct vm_resetcpu_params) /* Reset */
+#define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
+#define VMM_IOC_READREGS _IOWR('V', 7, struct vm_rwregs_params) /* Get registers */
+#define VMM_IOC_WRITEREGS _IOW('V', 8, struct vm_rwregs_params) /* Set registers */
+
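These ioctls are issued by userland (e.g. vmd/vmctl) against the vmm(4) device node. A minimal sketch of querying VM info, assuming the /dev/vmm node created by this changeset and trimming error reporting:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/vmmvar.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct vm_info_result res[4];		/* room for up to 4 VMs */
	struct vm_info_params vip = {
		.vip_size = sizeof(res),
		.vip_info = res
	};
	int fd = open("/dev/vmm", O_RDWR);

	if (fd == -1 || ioctl(fd, VMM_IOC_INFO, &vip) == -1)
		return (1);
	printf("%zu VM(s) present\n", vip.vip_info_ct);
	return (0);
}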
+#ifdef _KERNEL
+
+#define VMX_FAIL_LAUNCH_UNKNOWN 1
+#define VMX_FAIL_LAUNCH_INVALID_VMCS 2
+#define VMX_FAIL_LAUNCH_VALID_VMCS 3
+
+#define VMX_NUM_MSR_STORE 0
+// #define VMX_NUM_MSR_STORE 1
+
+/* MSR bitmap manipulation macros */
+#define MSRIDX(m) ((m) / 8)
+#define MSRBIT(m) (1 << (m) % 8)
+
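MSRIDX()/MSRBIT() locate one MSR's bit inside the 4 KB MSR bitmap page referenced by vc_msr_bitmap_va below; with MSR bitmaps enabled, a set bit makes the corresponding RDMSR/WRMSR exit. An illustrative fragment (the vcpu pointer stands in for a real struct vcpu *):

/* Illustrative: intercept guest reads of MSR_APICBASE (a low MSR). */
uint8_t *bitmap = (uint8_t *)vcpu->vc_msr_bitmap_va;
bitmap[MSRIDX(MSR_APICBASE)] |= MSRBIT(MSR_APICBASE);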
+enum {
+ VMM_MODE_UNKNOWN,
+ VMM_MODE_VMX,
+ VMM_MODE_EPT,
+ VMM_MODE_SVM,
+ VMM_MODE_RVI
+};
+
+enum {
+ VMM_MEM_TYPE_REGULAR,
+ VMM_MEM_TYPE_UNKNOWN
+};
+
+/* Forward declarations */
+struct vm;
+
+/*
+ * Implementation-specific cpu state
+ */
+struct vmcb {
+};
+
+struct vmcs {
+ uint32_t vmcs_revision;
+};
+
+struct vmx_invvpid_descriptor
+{
+ uint64_t vid_vpid; // : 16;
+ uint64_t vid_addr;
+};
+
+struct vmx_invept_descriptor
+{
+ uint64_t vid_eptp;
+ uint64_t vid_reserved;
+};
+
+struct vmx_msr_store
+{
+ uint64_t vms_index : 32;
+ uint64_t vms_data;
+};
+
+/*
+ * Storage for guest registers not preserved in VMCS and various exit
+ * information.
+ *
+ * Note that vmx_enter_guest depends on the layout of this struct for
+ * field access.
+ */
+struct vmx_gueststate
+{
+ /* %esi should be first */
+ uint32_t vg_esi; /* 0x00 */
+ uint32_t vg_eax; /* 0x04 */
+ uint32_t vg_ebx; /* 0x08 */
+ uint32_t vg_ecx; /* 0x0c */
+ uint32_t vg_edx; /* 0x10 */
+ uint32_t vg_edi; /* 0x14 */
+ uint32_t vg_ebp; /* 0x18 */
+ uint32_t vg_cr2; /* 0x1c */
+ uint32_t vg_eip; /* 0x20 */
+ uint32_t vg_exit_reason; /* 0x24 */
+ uint32_t vg_eflags; /* 0x28 */
+};
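Because the assembly entry path (vmx_enter_guest in vmm_support.S) reaches into this struct at the fixed offsets noted in the comments, a compile-time check can guard the layout; a sketch using a generic negative-array-size assertion (the kernel's CTASSERT(), where available, would serve the same purpose):

/* Sketch: fail the build if the offsets noted above ever drift. */
typedef char vg_esi_off_ck[(offsetof(struct vmx_gueststate, vg_esi) == 0x00) ? 1 : -1];
typedef char vg_eax_off_ck[(offsetof(struct vmx_gueststate, vg_eax) == 0x04) ? 1 : -1];
typedef char vg_eflags_off_ck[(offsetof(struct vmx_gueststate, vg_eflags) == 0x28) ? 1 : -1];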
+
+/*
+ * Virtual Machine
+ */
+struct vm;
+
+/*
+ * Virtual CPU
+ */
+struct vcpu {
+ /* VMCS / VMCB pointer */
+ vaddr_t vc_control_va;
+ uint64_t vc_control_pa;
+
+ /* VLAPIC pointer */
+ vaddr_t vc_vlapic_va;
+ uint64_t vc_vlapic_pa;
+
+ /* MSR bitmap address */
+ vaddr_t vc_msr_bitmap_va;
+ uint64_t vc_msr_bitmap_pa;
+
+ struct vm *vc_parent;
+ uint32_t vc_id;
+ u_int vc_state;
+ SLIST_ENTRY(vcpu) vc_vcpu_link;
+ vaddr_t vc_hsa_stack_va;
+
+ uint8_t vc_virt_mode;
+
+ struct cpu_info *vc_last_pcpu;
+ union vm_exit vc_exit;
+
+ uint16_t vc_intr;
+ uint8_t vc_irqready;
+
+ /* VMX only */
+ uint64_t vc_vmx_basic;
+ uint64_t vc_vmx_entry_ctls;
+ uint64_t vc_vmx_true_entry_ctls;
+ uint64_t vc_vmx_exit_ctls;
+ uint64_t vc_vmx_true_exit_ctls;
+ uint64_t vc_vmx_pinbased_ctls;
+ uint64_t vc_vmx_true_pinbased_ctls;
+ uint64_t vc_vmx_procbased_ctls;
+ uint64_t vc_vmx_true_procbased_ctls;
+ uint64_t vc_vmx_procbased2_ctls;
+ struct vmx_gueststate vc_gueststate;
+ vaddr_t vc_vmx_msr_exit_save_va;
+ paddr_t vc_vmx_msr_exit_save_pa;
+ vaddr_t vc_vmx_msr_exit_load_va;
+ paddr_t vc_vmx_msr_exit_load_pa;
+ vaddr_t vc_vmx_msr_entry_load_va;
+ paddr_t vc_vmx_msr_entry_load_pa;
+};
+
+SLIST_HEAD(vcpu_head, vcpu);
+
+void vmm_dispatch_intr(vaddr_t);
+int vmxon(uint64_t *);
+int vmxoff(void);
+int vmclear(uint64_t *);
+int vmptrld(uint64_t *);
+int vmptrst(uint64_t *);
+int vmwrite(uint32_t, uint32_t);
+int vmread(uint32_t, uint32_t *);
+void invvpid(uint32_t, struct vmx_invvpid_descriptor *);
+void invept(uint32_t, struct vmx_invept_descriptor *);
+int vmx_enter_guest(uint64_t *, struct vmx_gueststate *, int, vaddr_t);
+void start_vmm_on_cpu(struct cpu_info *);
+void stop_vmm_on_cpu(struct cpu_info *);
+
+#endif /* _KERNEL */
+
+#endif /* ! _MACHINE_VMMVAR_H_ */