author	mikeb <mikeb@openbsd.org>	2015-12-08 19:29:22 +0000
committer	mikeb <mikeb@openbsd.org>	2015-12-08 19:29:22 +0000
commit	c718e78e8bd71485fe6e5c6371366e52cb61f227 (patch)
tree	6126d3b6d52794abd662c4fb28cf7cf2cbc6608a
parent	Allocate and hook up a "shared info page" (diff)
Allocate and hook up a "shared info page"
This page provides a matrix of pending events and some other information, such as the hypervisor timecounter. OK mlarkin, reyk
-rw-r--r--	sys/dev/pv/xen.c	82
-rw-r--r--	sys/dev/pv/xenreg.h	368
2 files changed, 450 insertions(+), 0 deletions(-)
diff --git a/sys/dev/pv/xen.c b/sys/dev/pv/xen.c
index 09f72bcaa14..cfa68c61dca 100644
--- a/sys/dev/pv/xen.c
+++ b/sys/dev/pv/xen.c
@@ -36,6 +36,7 @@ struct xen_softc *xen_sc;
int xen_init_hypercall(struct xen_softc *);
int xen_getversion(struct xen_softc *);
int xen_getfeatures(struct xen_softc *);
+int xen_init_info_page(struct xen_softc *);
int xen_match(struct device *, void *, void *);
void xen_attach(struct device *, struct device *, void *);
@@ -83,6 +84,9 @@ xen_attach(struct device *parent, struct device *self, void *aux)
return;
if (xen_getfeatures(sc))
return;
+
+ if (xen_init_info_page(sc))
+ return;
}
void
@@ -317,3 +321,81 @@ xen_getfeatures(struct xen_softc *sc)
"\006PTUPD\005PAE4G\004SUPERVISOR\003AUTOPMAP\002WDT\001WPT");
return (0);
}
+
+#ifdef XEN_DEBUG
+void
+xen_print_info_page(void)
+{
+ struct xen_softc *sc = xen_sc;
+ struct shared_info *s = sc->sc_ipg;
+ struct vcpu_info *v;
+ int i;
+
+ membar_sync();
+ for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
+ v = &s->vcpu_info[i];
+ if (!v->evtchn_upcall_pending && !v->evtchn_upcall_mask &&
+ !v->evtchn_pending_sel && !v->time.version &&
+ !v->time.tsc_timestamp && !v->time.system_time &&
+ !v->time.tsc_to_system_mul && !v->time.tsc_shift)
+ continue;
+ printf("vcpu%d:\n"
+ " upcall_pending=%02x upcall_mask=%02x pending_sel=%#lx\n"
+ " time version=%u tsc=%llu system=%llu\n"
+ " time mul=%u shift=%d\n"
+ , i, v->evtchn_upcall_pending, v->evtchn_upcall_mask,
+ v->evtchn_pending_sel, v->time.version,
+ v->time.tsc_timestamp, v->time.system_time,
+ v->time.tsc_to_system_mul, v->time.tsc_shift);
+ }
+ printf("pending events: ");
+ for (i = 0; i < nitems(s->evtchn_pending); i++) {
+ if (s->evtchn_pending[i] == 0)
+ continue;
+ printf(" %d:%#lx", i, s->evtchn_pending[i]);
+ }
+ printf("\nmasked events: ");
+ for (i = 0; i < nitems(s->evtchn_mask); i++) {
+ if (s->evtchn_mask[i] == ~0UL)
+ continue;
+ printf(" %d:%#lx", i, s->evtchn_mask[i]);
+ }
+ printf("\nwc ver=%u sec=%u nsec=%u\n", s->wc_version, s->wc_sec,
+ s->wc_nsec);
+ printf("arch maxpfn=%lu framelist=%lu nmi=%lu\n", s->arch.max_pfn,
+ s->arch.pfn_to_mfn_frame_list, s->arch.nmi_reason);
+}
+#endif /* XEN_DEBUG */
+
+int
+xen_init_info_page(struct xen_softc *sc)
+{
+ struct xen_add_to_physmap xatp;
+ paddr_t pa;
+
+ sc->sc_ipg = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->sc_ipg == NULL) {
+ printf("%s: failed to allocate shared info page\n",
+ sc->sc_dev.dv_xname);
+ return (-1);
+ }
+ if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_ipg, &pa)) {
+ printf("%s: shared info page PA extraction failed\n",
+ sc->sc_dev.dv_xname);
+ free(sc->sc_ipg, M_DEVBUF, PAGE_SIZE);
+ return (-1);
+ }
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = atop(pa);
+ if (xen_hypercall(sc, memory_op, 2, XENMEM_add_to_physmap, &xatp)) {
+ printf("%s: failed to register shared info page\n",
+ sc->sc_dev.dv_xname);
+ free(sc->sc_ipg, M_DEVBUF, PAGE_SIZE);
+ return (-1);
+ }
+ DPRINTF("%s: shared info page at va %p pa %#lx\n", sc->sc_dev.dv_xname,
+ sc->sc_ipg, pa);
+ return (0);
+}
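
Usage note (not part of this commit): once the hypercall returns, the page contents are live and can be read with the version protocols documented in xenreg.h below. A minimal sketch of a wallclock read follows; xen_read_wallclock() is a hypothetical helper, and membar_consumer() is the OpenBSD read barrier from the same family as the membar_sync() used above. An odd version, or a version that changed mid-read, means Xen was updating the fields and the read must be retried.

	void
	xen_read_wallclock(struct xen_softc *sc, uint32_t *sec, uint32_t *nsec)
	{
		struct shared_info *s = sc->sc_ipg;
		uint32_t ver;

		do {
			ver = s->wc_version;
			membar_consumer();	/* read fields after the version */
			*sec = s->wc_sec;
			*nsec = s->wc_nsec;
			membar_consumer();	/* re-check the version last */
		} while ((ver & 1) != 0 || ver != s->wc_version);
	}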
diff --git a/sys/dev/pv/xenreg.h b/sys/dev/pv/xenreg.h
index 3f646d33011..ec45722576d 100644
--- a/sys/dev/pv/xenreg.h
+++ b/sys/dev/pv/xenreg.h
@@ -87,6 +87,31 @@
#define CPUID_OFFSET_XEN_HYPERCALL 0x2
+#if defined(__i386__) || defined(__amd64__)
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad;
+} __packed;
+
+typedef unsigned long xen_pfn_t;
+typedef unsigned long xen_ulong_t;
+
+/* Maximum number of virtual CPUs in legacy multi-processor guests. */
+#define XEN_LEGACY_MAX_VCPUS 32
+
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list;
+ unsigned long nmi_reason;
+ uint64_t pad[32];
+} __packed;
+#else
+#error "Not implemented"
+#endif /* __i386__ || __amd64__ */
+
/*
* interface/xen.h
*/
@@ -97,6 +122,300 @@ typedef uint16_t domid_t;
#define DOMID_SELF (0x7FF0U)
/*
+ * Event channel endpoints per domain:
+ * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ */
+#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+
+struct vcpu_time_info {
+ /*
+ * Updates to the following values are preceded and followed by an
+ * increment of 'version'. The guest can therefore detect updates by
+ * looking for changes to 'version'. If the least-significant bit of
+ * the version number is set then an update is in progress and the
+ * guest must wait to read a consistent set of values.
+ *
+ * The correct way to interact with the version number is similar to
+ * Linux's seqlock: see the implementations of read_seqbegin and
+ * read_seqretry.
+ */
+ uint32_t version;
+ uint32_t pad0;
+ uint64_t tsc_timestamp; /* TSC at last update of time vals. */
+ uint64_t system_time; /* Time, in nanosecs, since boot. */
+ /*
+ * Current system time:
+ * system_time +
+ * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32)
+ * CPU frequency (Hz):
+ * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+ */
+ uint32_t tsc_to_system_mul;
+ int8_t tsc_shift;
+ int8_t pad1[3];
+} __packed; /* 32 bytes */
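
To make the formula above concrete, here is a hedged sketch of a consistent read, not part of this commit: xen_system_time() is a hypothetical helper, rdtsc() is the x86 inline from <machine/cpufunc.h>, and the 64x32 multiply is truncated here for brevity where a production version needs a wider (96-bit) intermediate.

	uint64_t
	xen_system_time(volatile struct vcpu_time_info *t)
	{
		uint32_t ver;
		uint64_t delta, ns;

		do {
			ver = t->version;
			membar_consumer();
			delta = rdtsc() - t->tsc_timestamp;
			if (t->tsc_shift >= 0)
				delta <<= t->tsc_shift;
			else
				delta >>= -t->tsc_shift;
			/* NB: (delta * mul) can overflow 64 bits */
			ns = t->system_time +
			    ((delta * t->tsc_to_system_mul) >> 32);
			membar_consumer();
		} while ((ver & 1) != 0 || ver != t->version);
		return (ns);
	}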
+
+struct vcpu_info {
+ /*
+ * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
+ * a pending notification for a particular VCPU. It is then cleared
+ * by the guest OS /before/ checking for pending work, thus avoiding
+ * a set-and-check race. Note that the mask is only accessed by Xen
+ * on the CPU that is currently hosting the VCPU. This means that the
+ * pending and mask flags can be updated by the guest without special
+ * synchronisation (i.e., no need for the x86 LOCK prefix).
+ * This may seem suboptimal because if the pending flag is set by
+ * a different CPU then an IPI may be scheduled even when the mask
+ * is set. However, note:
+ * 1. The task of 'interrupt holdoff' is covered by the per-event-
+ * channel mask bits. A 'noisy' event that is continually being
+ * triggered can be masked at source at this very precise
+ * granularity.
+ * 2. The main purpose of the per-VCPU mask is therefore to restrict
+ * reentrant execution: whether for concurrency control, or to
+ * prevent unbounded stack usage. Whatever the purpose, we expect
+ * that the mask will be asserted only for short periods at a time,
+ * and so the likelihood of a 'spurious' IPI is suitably small.
+ * The mask is read before making an event upcall to the guest: a
+ * non-zero mask therefore guarantees that the VCPU will not receive
+ * an upcall activation. The mask is cleared when the VCPU requests
+ * to block: this avoids wakeup-waiting races.
+ */
+ uint8_t evtchn_upcall_pending;
+ uint8_t pad1[3];
+ uint8_t evtchn_upcall_mask;
+ uint8_t pad2[3];
+ unsigned long evtchn_pending_sel;
+ struct arch_vcpu_info arch;
+ struct vcpu_time_info time;
+} __packed; /* 64 bytes (x86) */
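
A hedged illustration of the masking protocol described above (hypothetical code, not part of this commit): since no LOCK prefix is needed, the mask is a plain store, but an event arriving while masked only sets the pending flag, so it must be checked after unmasking. The xen_upcall() dispatcher is sketched after struct shared_info below.

	void
	xen_block_upcalls(struct xen_softc *sc, int vcpu)
	{
		struct vcpu_info *v = &sc->sc_ipg->vcpu_info[vcpu];

		v->evtchn_upcall_mask = 1;	/* Xen will not deliver upcalls */
		membar_producer();
		/* ... critical section ... */
		membar_producer();
		v->evtchn_upcall_mask = 0;
		membar_sync();
		if (v->evtchn_upcall_pending)	/* arrived while masked */
			xen_upcall(sc, vcpu);	/* hypothetical dispatcher */
	}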
+
+/*
+ * Xen/kernel shared data -- pointer provided in start_info.
+ *
+ * This structure is defined to be both smaller than a page, and the only data
+ * on the shared page, but may vary in actual size even within compatible Xen
+ * versions; guests should not rely on the size of this structure remaining
+ * constant.
+ */
+struct shared_info {
+ struct vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS];
+
+ /*
+ * A domain can create "event channels" on which it can send and
+ * receive asynchronous event notifications. There are three classes
+ * of event that are delivered by this mechanism:
+ * 1. Bi-directional inter- and intra-domain connections. Domains
+ * must arrange out-of-band to set up a connection (usually by
+ * allocating an unbound 'listener' port and advertising that via
+ * a storage service such as xenstore).
+ * 2. Physical interrupts. A domain with suitable hardware-access
+ * privileges can bind an event-channel port to a physical
+ * interrupt source.
+ * 3. Virtual interrupts ('events'). A domain can bind an event
+ * channel port to a virtual interrupt source, such as the
+ * virtual-timer device or the emergency console.
+ *
+ * Event channels are addressed by a "port index". Each channel is
+ * associated with two bits of information:
+ * 1. PENDING -- notifies the domain that there is a pending
+ * notification to be processed. This bit is cleared by the guest.
+ * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
+ * will cause an asynchronous upcall to be scheduled. This bit is
+ * only updated by the guest. It is read-only within Xen. If a
+ * channel becomes pending while the channel is masked then the
+ * 'edge' is lost (i.e., when the channel is unmasked, the guest
+ * must manually handle pending notifications as no upcall will be
+ * scheduled by Xen).
+ *
+ * To expedite scanning of pending notifications, any 0->1 pending
+ * transition on an unmasked channel causes a corresponding bit in a
+ * per-vcpu selector word to be set. Each bit in the selector covers a
+ * 'C long' in the PENDING bitfield array.
+ */
+ unsigned long evtchn_pending[sizeof(unsigned long) * 8];
+ unsigned long evtchn_mask[sizeof(unsigned long) * 8];
+
+ /*
+ * Wallclock time: updated only by control software. Guests should
+ * base their gettimeofday() syscall on this wallclock-base value.
+ */
+ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
+ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
+ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
+
+ struct arch_shared_info arch;
+} __packed;
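
The two-level scan this layout implies might look as follows. A sketch only, not part of this commit: xen_intr_dispatch() is hypothetical, atomic_swap_ulong() is the OpenBSD atomic, and atomic_clearbit_ptr() stands in for whatever locked bit-clear the driver ends up using; pending bits must be cleared atomically because Xen may set others in the same word concurrently.

	void
	xen_upcall(struct xen_softc *sc, int vcpu)
	{
		struct shared_info *s = sc->sc_ipg;
		struct vcpu_info *v = &s->vcpu_info[vcpu];
		unsigned long sel, row;
		int i, bit;

		v->evtchn_upcall_pending = 0;	/* clear before scanning */
		sel = atomic_swap_ulong(&v->evtchn_pending_sel, 0);
		for (i = 0; sel != 0; i++, sel >>= 1) {
			if ((sel & 1) == 0)
				continue;
			/* pending and not masked at the source */
			row = s->evtchn_pending[i] & ~s->evtchn_mask[i];
			for (bit = 0; row != 0; bit++, row >>= 1) {
				if ((row & 1) == 0)
					continue;
				/* assumed locked bit-clear helper */
				atomic_clearbit_ptr(&s->evtchn_pending[i], bit);
				xen_intr_dispatch(i * (sizeof(long) * 8) + bit);
			}
		}
	}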
+
+
+/*
+ * interface/hvm/hvm_op.h
+ */
+
+/* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */
+#define HVMOP_set_param 0
+#define HVMOP_get_param 1
+struct xen_hvm_param {
+ domid_t domid; /* IN */
+ uint32_t index; /* IN */
+ uint64_t value; /* IN/OUT */
+};
+
+/*
+ * Parameter space for HVMOP_{set,get}_param.
+ */
+
+/*
+ * How should CPU0 event-channel notifications be delivered?
+ * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt).
+ * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
+ * Domain = val[47:32], Bus = val[31:16],
+ * DevFn = val[15: 8], IntX = val[ 1: 0]
+ * val[63:56] == 2: val[7:0] is a vector number, check for
+ * XENFEAT_hvm_callback_vector to know if this delivery
+ * method is available.
+ * If val == 0 then CPU0 event-channel notifications are not delivered.
+ */
+#define HVM_PARAM_CALLBACK_IRQ 0
+
+/*
+ * These are not used by Xen. They are here for convenience of HVM-guest
+ * xenbus implementations.
+ */
+#define HVM_PARAM_STORE_PFN 1
+#define HVM_PARAM_STORE_EVTCHN 2
+
+#define HVM_PARAM_PAE_ENABLED 4
+
+#define HVM_PARAM_IOREQ_PFN 5
+
+#define HVM_PARAM_BUFIOREQ_PFN 6
+#define HVM_PARAM_BUFIOREQ_EVTCHN 26
+
+/*
+ * Set mode for virtual timers (currently x86 only):
+ * delay_for_missed_ticks (default):
+ * Do not advance a vcpu's time beyond the correct delivery time for
+ * interrupts that have been missed due to preemption. Deliver missed
+ * interrupts when the vcpu is rescheduled and advance the vcpu's virtual
+ * time stepwise for each one.
+ * no_delay_for_missed_ticks:
+ * As above, missed interrupts are delivered, but guest time always tracks
+ * wallclock (i.e., real) time while doing so.
+ * no_missed_ticks_pending:
+ * No missed interrupts are held pending. Instead, to ensure ticks are
+ * delivered at some non-zero rate, if we detect missed ticks then the
+ * internal tick alarm is not disabled if the VCPU is preempted during the
+ * next tick period.
+ * one_missed_tick_pending:
+ * Missed interrupts are collapsed together and delivered as one 'late tick'.
+ * Guest time always tracks wallclock (i.e., real) time.
+ */
+#define HVM_PARAM_TIMER_MODE 10
+#define HVMPTM_delay_for_missed_ticks 0
+#define HVMPTM_no_delay_for_missed_ticks 1
+#define HVMPTM_no_missed_ticks_pending 2
+#define HVMPTM_one_missed_tick_pending 3
+
+/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
+#define HVM_PARAM_HPET_ENABLED 11
+
+/* Identity-map page directory used by Intel EPT when CR0.PG=0. */
+#define HVM_PARAM_IDENT_PT 12
+
+/* Device Model domain, defaults to 0. */
+#define HVM_PARAM_DM_DOMAIN 13
+
+/* ACPI S state: currently support S0 and S3 on x86. */
+#define HVM_PARAM_ACPI_S_STATE 14
+
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS 15
+
+/* Boolean: Enable aligning all periodic vpts to reduce interrupts */
+#define HVM_PARAM_VPT_ALIGN 16
+
+/* Console debug shared memory ring and event channel */
+#define HVM_PARAM_CONSOLE_PFN 17
+#define HVM_PARAM_CONSOLE_EVTCHN 18
+
+/*
+ * Select location of ACPI PM1a and TMR control blocks. Currently two locations
+ * are supported, specified by version 0 or 1 in this parameter:
+ * - 0: default, use the old addresses
+ * PM1A_EVT == 0x1f40; PM1A_CNT == 0x1f44; PM_TMR == 0x1f48
+ * - 1: use the new default qemu addresses
+ * PM1A_EVT == 0xb000; PM1A_CNT == 0xb004; PM_TMR == 0xb008
+ * You can find these address definitions in <hvm/ioreq.h>
+ */
+#define HVM_PARAM_ACPI_IOPORTS_LOCATION 19
+
+/* Enable blocking memory events, async or sync (pause vcpu until response)
+ * onchangeonly indicates messages only on a change of value */
+#define HVM_PARAM_MEMORY_EVENT_CR0 20
+#define HVM_PARAM_MEMORY_EVENT_CR3 21
+#define HVM_PARAM_MEMORY_EVENT_CR4 22
+#define HVM_PARAM_MEMORY_EVENT_INT3 23
+#define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP 25
+
+#define HVMPME_MODE_MASK (3 << 0)
+#define HVMPME_mode_disabled 0
+#define HVMPME_mode_async 1
+#define HVMPME_mode_sync 2
+#define HVMPME_onchangeonly (1 << 2)
+
+/* Boolean: Enable nestedhvm (hvm only) */
+#define HVM_PARAM_NESTEDHVM 24
+
+/* Params for the mem event rings */
+#define HVM_PARAM_PAGING_RING_PFN 27
+#define HVM_PARAM_ACCESS_RING_PFN 28
+#define HVM_PARAM_SHARING_RING_PFN 29
+
+#define HVM_NR_PARAMS 30
+
+/** The callback method types for Hypervisor event delivery to our domain. */
+enum {
+ HVM_CB_TYPE_GSI,
+ HVM_CB_TYPE_PCI_INTX,
+ HVM_CB_TYPE_VECTOR,
+ HVM_CB_TYPE_MASK = 0xFF,
+ HVM_CB_TYPE_SHIFT = 56
+};
+
+/** Format for specifying a GSI type callback. */
+enum {
+ HVM_CB_GSI_GSI_MASK = 0xFFFFFFFF,
+ HVM_CB_GSI_GSI_SHIFT = 0
+};
+#define HVM_CALLBACK_GSI(gsi) \
+ (((uint64_t)HVM_CB_TYPE_GSI << HVM_CB_TYPE_SHIFT) | \
+ ((gsi) & HVM_CB_GSI_GSI_MASK) << HVM_CB_GSI_GSI_SHIFT)
+
+/** Format for specifying a virtual PCI interrupt line GSI style callback. */
+enum {
+ HVM_CB_PCI_INTX_INTPIN_MASK = 0x3,
+ HVM_CB_PCI_INTX_INTPIN_SHIFT = 0,
+ HVM_CB_PCI_INTX_SLOT_MASK = 0x1F,
+ HVM_CB_PCI_INTX_SLOT_SHIFT = 11,
+};
+#define HVM_CALLBACK_PCI_INTX(slot, pin) \
+ (((uint64_t)HVM_CB_TYPE_PCI_INTX << HVM_CB_TYPE_SHIFT) | \
+ (((slot) & HVM_CB_PCI_INTX_SLOT_MASK) << HVM_CB_PCI_INTX_SLOT_SHIFT) | \
+ (((pin) & HVM_CB_PCI_INTX_INTPIN_MASK) << HVM_CB_PCI_INTX_INTPIN_SHIFT))
+
+/** Format for specifying a direct IDT vector injection style callback. */
+enum {
+ HVM_CB_VECTOR_VECTOR_MASK = 0xFFFFFFFF,
+ HVM_CB_VECTOR_VECTOR_SHIFT = 0
+};
+#define HVM_CALLBACK_VECTOR(vector) \
+ (((uint64_t)HVM_CB_TYPE_VECTOR << HVM_CB_TYPE_SHIFT) | \
+ (((vector) & HVM_CB_VECTOR_VECTOR_MASK) << HVM_CB_VECTOR_VECTOR_SHIFT))
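
Tying HVM_PARAM_CALLBACK_IRQ to these macros, a hedged sketch of registering a vector-style callback (not part of this commit): the hvm_op hypercall selector name and the vector number 0x70 are assumptions, and a guest should first check XENFEAT_hvm_callback_vector before relying on this delivery method.

	struct xen_hvm_param xhp;

	xhp.domid = DOMID_SELF;
	xhp.index = HVM_PARAM_CALLBACK_IRQ;
	xhp.value = HVM_CALLBACK_VECTOR(0x70);	/* assumed free IDT vector */
	if (xen_hypercall(sc, hvm_op, 2, HVMOP_set_param, &xhp))
		printf("%s: failed to set callback vector\n",
		    sc->sc_dev.dv_xname);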
+
+
+
+/*
* interface/features.h
*
* Feature flags, reported by XENVER_get_features.
@@ -144,6 +463,55 @@ typedef uint16_t domid_t;
/* operation as Dom0 is supported */
#define XENFEAT_dom0 11
+/*
+ * interface/memory.h
+ *
+ * Memory reservation and information.
+ */
+
+/*
+ * Increase or decrease the specified domain's memory reservation.
+ * Returns the number of extents successfully allocated or freed.
+ * arg == addr of struct xen_memory_reservation.
+ */
+#define XENMEM_increase_reservation 0
+#define XENMEM_decrease_reservation 1
+#define XENMEM_populate_physmap 6
+
+#define XENMAPSPACE_shared_info 0 /* shared info page */
+#define XENMAPSPACE_grant_table 1 /* grant table page */
+#define XENMAPSPACE_gmfn 2 /* GMFN */
+#define XENMAPSPACE_gmfn_range 3 /* GMFN range */
+#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another domain */
+
+/*
+ * Sets the GPFN at which a particular page appears in the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_add_to_physmap_t.
+ */
+#define XENMEM_add_to_physmap 7
+struct xen_add_to_physmap {
+ /* Which domain to change the mapping for. */
+ domid_t domid;
+
+ /* Number of pages to go through for gmfn_range */
+ uint16_t size;
+
+ /* Source mapping space. */
+ unsigned int space;
+
+#define XENMAPIDX_grant_table_status 0x80000000
+
+ /* Index into source mapping space. */
+ xen_ulong_t idx;
+
+ /* GPFN where the source mapping page should appear. */
+ xen_pfn_t gpfn;
+};
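
For contrast with the shared-info registration in xen.c above, a hypothetical mapping of the first grant-table frame would change only the space and idx fields; pa is assumed to be the physical address of a pre-allocated, page-aligned page.

	struct xen_add_to_physmap xatp;

	xatp.domid = DOMID_SELF;
	xatp.idx = 0;				/* first grant-table frame */
	xatp.space = XENMAPSPACE_grant_table;
	xatp.gpfn = atop(pa);
	(void)xen_hypercall(sc, memory_op, 2, XENMEM_add_to_physmap, &xatp);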
/*
* interface/version.h