summaryrefslogtreecommitdiffstats
path: root/usr.sbin/vmd
diff options
context:
space:
mode:
authorreyk <reyk@openbsd.org>2015-12-02 09:14:25 +0000
committerreyk <reyk@openbsd.org>2015-12-02 09:14:25 +0000
commitaf96af6c62bd200f3658b7d5ec868ef46d2cfd01 (patch)
tree7a81d339404ef73de70b5b7bce6540d2f48a07c6 /usr.sbin/vmd
parentwhitespaces (diff)
downloadwireguard-openbsd-af96af6c62bd200f3658b7d5ec868ef46d2cfd01.tar.xz
wireguard-openbsd-af96af6c62bd200f3658b7d5ec868ef46d2cfd01.zip
Start tweaking vmd's privsep and daemon model by splitting the main
process into multiple parts and adopting the "proc.c"-style from other daemons. This allows us to further reduce the privileges, to give better pledge(2), and to add some upcoming changes. "please do" mlarkin@, deraadt@
Diffstat (limited to 'usr.sbin/vmd')
-rw-r--r--usr.sbin/vmd/Makefile7
-rw-r--r--usr.sbin/vmd/control.c367
-rw-r--r--usr.sbin/vmd/proc.c632
-rw-r--r--usr.sbin/vmd/proc.h187
-rw-r--r--usr.sbin/vmd/vmd.c1681
-rw-r--r--usr.sbin/vmd/vmd.h50
-rw-r--r--usr.sbin/vmd/vmm.c1408
7 files changed, 2764 insertions, 1568 deletions
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile
index 14518248692..51eba5700cc 100644
--- a/usr.sbin/vmd/Makefile
+++ b/usr.sbin/vmd/Makefile
@@ -2,14 +2,15 @@
.if ${MACHINE} == "amd64"
PROG= vmd
-SRCS= vmd.c loadfile_elf.c pci.c virtio.c log.c
+SRCS= vmm.c loadfile_elf.c pci.c virtio.c
+SRCS+= vmd.c control.c log.c proc.c
CFLAGS+= -Wall -I${.CURDIR}
CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes
CFLAGS+= -Wmissing-declarations
CFLAGS+= -Wshadow -Wpointer-arith -Wcast-qual
CFLAGS+= -Wsign-compare
-LDADD+= -lutil -lpthread
-DPADD+= ${LIBUTIL}
+LDADD+= -lutil -lpthread -levent
+DPADD+= ${LIBUTIL} ${LIBEVENT}
.else
diff --git a/usr.sbin/vmd/control.c b/usr.sbin/vmd/control.c
new file mode 100644
index 00000000000..e9c23e98255
--- /dev/null
+++ b/usr.sbin/vmd/control.c
@@ -0,0 +1,367 @@
+/* $OpenBSD: control.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */
+
+/*
+ * Copyright (c) 2010-2015 Reyk Floeter <reyk@openbsd.org>
+ * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/tree.h>
+
+#include <net/if.h>
+
+#include <errno.h>
+#include <event.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "proc.h"
+#include "vmd.h"
+
+#define CONTROL_BACKLOG 5
+
+struct ctl_connlist ctl_conns;
+
+void
+ control_accept(int, short, void *);
+struct ctl_conn
+ *control_connbyfd(int);
+void control_close(int, struct control_sock *);
+void control_dispatch_imsg(int, short, void *);
+int control_dispatch_vmm(int, struct privsep_proc *, struct imsg *);
+void control_imsg_forward(struct imsg *);
+void control_run(struct privsep *, struct privsep_proc *, void *);
+
+/* Peer table handed to proc_run(): the parent is the only peer listed. */
+static struct privsep_proc procs[] = {
+	{
+		.p_title = "parent",
+		.p_id = PROC_PARENT,
+		.p_cb = control_dispatch_vmm
+	}
+};
+
+/*
+ * Entry point of the control process: delegate to proc_run() with this
+ * file's peer table and control_run() as the run hook.  Returns whatever
+ * proc_run() returns.
+ */
+pid_t
+control(struct privsep *ps, struct privsep_proc *p)
+{
+	pid_t	 pid;
+
+	pid = proc_run(ps, p, procs, nitems(procs), control_run, NULL);
+	return (pid);
+}
+
+/*
+ * Run hook of the control process; called by proc_run() right before
+ * the event loop is entered.  Only restricts the process with pledge(2).
+ */
+void
+control_run(struct privsep *ps, struct privsep_proc *p, void *arg)
+{
+	/*
+	 * Promises needed by the control process:
+	 * stdio - malloc and basic I/O including events.
+	 * unix  - the control socket.
+	 */
+	const char	*promises = "stdio unix";
+
+	if (pledge(promises, NULL) != -1)
+		return;
+	fatal("pledge");
+}
+
+/*
+ * imsg callback for messages arriving from the parent process.
+ *
+ * The header's peerid carries the fd of the control connection that
+ * issued the request (see control_dispatch_imsg()); the response is
+ * relayed to that client unchanged.
+ *
+ * Returns 0 if the message was relayed, -1 if the connection is gone or
+ * the message type is not one this process relays.
+ */
+int
+control_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg)
+{
+	struct ctl_conn	*c;
+
+	if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) {
+		/* peerid is a uint32_t, so it must be printed unsigned. */
+		log_warnx("%s: fd %u: not found", __func__, imsg->hdr.peerid);
+		return (-1);
+	}
+
+	switch (imsg->hdr.type) {
+	case IMSG_VMDOP_START_VM_RESPONSE:
+	case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
+	case IMSG_VMDOP_GET_INFO_VM_DATA:
+	case IMSG_VMDOP_GET_INFO_VM_END_DATA:
+		imsg_compose_event(&c->iev, imsg->hdr.type,
+		    0, 0, -1, imsg->data, IMSG_DATA_SIZE(imsg));
+		break;
+	default:
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Create and bind the UNIX-domain control socket described by cs.
+ *
+ * A socket without a name is silently skipped.  Any stale socket file is
+ * removed first; the new file is created under a temporary umask and then
+ * chmod'ed to its final mode (world read/write when cs_restricted is set,
+ * owner/group read/write otherwise).  On success the descriptor is stored
+ * in cs->cs_fd and ps in cs->cs_env.
+ *
+ * Returns 0 on success (or when there is nothing to do), -1 on error.
+ */
+int
+control_init(struct privsep *ps, struct control_sock *cs)
+{
+	struct sockaddr_un	 sun;
+	int			 fd;
+	mode_t			 old_umask, mode;
+
+	if (cs->cs_name == NULL)
+		return (0);
+
+	if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0)) == -1) {
+		log_warn("%s: socket", __func__);
+		return (-1);
+	}
+
+	/* Never hand uninitialized stack bytes to bind(2). */
+	memset(&sun, 0, sizeof(sun));
+	sun.sun_family = AF_UNIX;
+	if (strlcpy(sun.sun_path, cs->cs_name,
+	    sizeof(sun.sun_path)) >= sizeof(sun.sun_path)) {
+		/* strlcpy() does not set errno: use the errno-less logger. */
+		log_warnx("%s: %s name too long", __func__, cs->cs_name);
+		close(fd);
+		return (-1);
+	}
+
+	/* A missing file is fine, anything else is an error. */
+	if (unlink(cs->cs_name) == -1 && errno != ENOENT) {
+		log_warn("%s: unlink %s", __func__, cs->cs_name);
+		close(fd);
+		return (-1);
+	}
+
+	if (cs->cs_restricted) {
+		old_umask = umask(S_IXUSR|S_IXGRP|S_IXOTH);
+		mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH;
+	} else {
+		old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH);
+		mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP;
+	}
+
+	if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) {
+		log_warn("%s: bind: %s", __func__, cs->cs_name);
+		close(fd);
+		(void)umask(old_umask);
+		return (-1);
+	}
+	(void)umask(old_umask);
+
+	if (chmod(cs->cs_name, mode) == -1) {
+		log_warn("%s: chmod", __func__);
+		close(fd);
+		(void)unlink(cs->cs_name);
+		return (-1);
+	}
+
+	cs->cs_fd = fd;
+	cs->cs_env = ps;
+
+	return (0);
+}
+
+/*
+ * Start listening on a control socket prepared by control_init() and
+ * register control_accept() both as its read handler and as the handler
+ * of the (not yet armed) pause timer.
+ *
+ * Returns 0 on success or for an unnamed socket, -1 if listen(2) fails.
+ */
+int
+control_listen(struct control_sock *cs)
+{
+	int	 lfd;
+
+	if (cs->cs_name == NULL)
+		return (0);
+
+	lfd = cs->cs_fd;
+	if (listen(lfd, CONTROL_BACKLOG) == -1) {
+		log_warn("%s: listen", __func__);
+		return (-1);
+	}
+
+	event_set(&cs->cs_ev, lfd, EV_READ, control_accept, cs);
+	event_add(&cs->cs_ev, NULL);
+	evtimer_set(&cs->cs_evt, control_accept, cs);
+
+	return (0);
+}
+
+/*
+ * Remove the accept event and the pause timer of a control socket.
+ * Unnamed sockets were never set up and are ignored.
+ */
+void
+control_cleanup(struct control_sock *cs)
+{
+	if (cs->cs_name != NULL) {
+		event_del(&cs->cs_ev);
+		event_del(&cs->cs_evt);
+	}
+}
+
+/* ARGSUSED */
+/*
+ * Event handler for a listening control socket (arg is its control_sock).
+ * Also used as the handler of the pause timer, in which case it only
+ * re-arms the accept event.
+ *
+ * Accepts one connection, wraps it in a ctl_conn whose imsg handler is
+ * control_dispatch_imsg(), and appends it to ctl_conns.
+ */
+void
+control_accept(int listenfd, short event, void *arg)
+{
+	struct control_sock	*cs = arg;
+	struct sockaddr_un	 peer;
+	struct timeval		 backoff = { 1, 0 };
+	struct ctl_conn		*conn;
+	struct imsgev		*iev;
+	socklen_t		 peerlen = sizeof(peer);
+	int			 newfd;
+
+	/* The accept event is not persistent: re-arm it on every call. */
+	event_add(&cs->cs_ev, NULL);
+	if (event & EV_TIMEOUT)
+		return;
+
+	newfd = accept4(listenfd, (struct sockaddr *)&peer, &peerlen,
+	    SOCK_NONBLOCK);
+	if (newfd == -1) {
+		switch (errno) {
+		case ENFILE:
+		case EMFILE:
+			/*
+			 * Out of file descriptors: stop accepting for a
+			 * second instead of spinning in the event loop.
+			 */
+			event_del(&cs->cs_ev);
+			evtimer_add(&cs->cs_evt, &backoff);
+			break;
+		case EWOULDBLOCK:
+		case EINTR:
+		case ECONNABORTED:
+			/* transient, nothing to report */
+			break;
+		default:
+			log_warn("%s: accept", __func__);
+			break;
+		}
+		return;
+	}
+
+	conn = calloc(1, sizeof(struct ctl_conn));
+	if (conn == NULL) {
+		log_warn("%s", __func__);
+		close(newfd);
+		return;
+	}
+
+	iev = &conn->iev;
+	imsg_init(&iev->ibuf, newfd);
+	iev->handler = control_dispatch_imsg;
+	iev->events = EV_READ;
+	iev->data = cs;
+	event_set(&iev->ev, iev->ibuf.fd, iev->events, iev->handler,
+	    iev->data);
+	event_add(&iev->ev, NULL);
+
+	TAILQ_INSERT_TAIL(&ctl_conns, conn, entry);
+}
+
+/*
+ * Look up the control connection whose imsg buffer uses descriptor fd.
+ * Returns NULL when no such connection exists.
+ */
+struct ctl_conn *
+control_connbyfd(int fd)
+{
+	struct ctl_conn	*conn;
+
+	TAILQ_FOREACH(conn, &ctl_conns, entry) {
+		if (conn->iev.ibuf.fd == fd)
+			return (conn);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Tear down the control connection using descriptor fd: drop queued
+ * output, unlink it from ctl_conns, remove its event, close the socket
+ * and free it.  If accepting was paused on cs for lack of descriptors,
+ * it is resumed since one just became available.
+ */
+void
+control_close(int fd, struct control_sock *cs)
+{
+	struct ctl_conn	*c;
+
+	if ((c = control_connbyfd(fd)) == NULL) {
+		/* Not an errno condition: do not append a stale strerror. */
+		log_warnx("%s: fd %d: not found", __func__, fd);
+		return;
+	}
+
+	msgbuf_clear(&c->iev.ibuf.w);
+	TAILQ_REMOVE(&ctl_conns, c, entry);
+
+	event_del(&c->iev.ev);
+	close(c->iev.ibuf.fd);
+
+	/* Some file descriptors are available again. */
+	if (evtimer_pending(&cs->cs_evt, NULL)) {
+		evtimer_del(&cs->cs_evt);
+		event_add(&cs->cs_ev, NULL);
+	}
+
+	free(c);
+}
+
+/* ARGSUSED */
+/*
+ * Event handler for an established control connection (arg is the
+ * control_sock the connection was accepted on).
+ *
+ * Reads/flushes the imsg buffer, then processes every complete message:
+ * each one is first forwarded to clients that asked for notifications,
+ * then handled by type.  VM requests are passed on to the parent with the
+ * connection's fd as peerid so the response can be routed back by
+ * control_dispatch_vmm().
+ *
+ * Whenever the connection is closed from inside the message loop the
+ * function returns immediately: control_close() frees the ctl_conn, so
+ * it must not be touched again.
+ */
+void
+control_dispatch_imsg(int fd, short event, void *arg)
+{
+	struct control_sock	*cs = arg;
+	struct privsep		*ps = cs->cs_env;
+	struct ctl_conn		*c;
+	struct imsg		 imsg;
+	int			 n, v;
+
+	if ((c = control_connbyfd(fd)) == NULL) {
+		/* Not an errno condition: do not append a stale strerror. */
+		log_warnx("%s: fd %d: not found", __func__, fd);
+		return;
+	}
+
+	if (event & EV_READ) {
+		if ((n = imsg_read(&c->iev.ibuf)) == -1 || n == 0) {
+			control_close(fd, cs);
+			return;
+		}
+	}
+	if (event & EV_WRITE) {
+		if (msgbuf_write(&c->iev.ibuf.w) <= 0 && errno != EAGAIN) {
+			control_close(fd, cs);
+			return;
+		}
+	}
+
+	for (;;) {
+		if ((n = imsg_get(&c->iev.ibuf, &imsg)) == -1) {
+			control_close(fd, cs);
+			return;
+		}
+
+		if (n == 0)
+			break;
+
+		control_imsg_forward(&imsg);
+
+		switch (imsg.hdr.type) {
+		case IMSG_CTL_NOTIFY:
+			if (c->flags & CTL_CONN_NOTIFY) {
+				log_debug("%s: "
+				    "client requested notify more than once",
+				    __func__);
+				imsg_compose_event(&c->iev, IMSG_CTL_FAIL,
+				    0, 0, -1, NULL, 0);
+				break;
+			}
+			c->flags |= CTL_CONN_NOTIFY;
+			break;
+		case IMSG_CTL_VERBOSE:
+			IMSG_SIZE_CHECK(&imsg, &v);
+
+			memcpy(&v, imsg.data, sizeof(v));
+			log_verbose(v);
+
+			proc_forward_imsg(ps, &imsg, PROC_PARENT, -1);
+			break;
+		case IMSG_VMDOP_START_VM_REQUEST:
+		case IMSG_VMDOP_TERMINATE_VM_REQUEST:
+		case IMSG_VMDOP_GET_INFO_VM_REQUEST:
+			imsg.hdr.peerid = fd;
+
+			if (imsg_compose_event(&ps->ps_ievs[PROC_PARENT][0],
+			    imsg.hdr.type, imsg.hdr.peerid, 0, -1,
+			    imsg.data, IMSG_DATA_SIZE(&imsg)) == -1) {
+				/* Release the message before bailing out. */
+				imsg_free(&imsg);
+				control_close(fd, cs);
+				return;
+			}
+			break;
+		default:
+			log_debug("%s: error handling imsg %d",
+			    __func__, imsg.hdr.type);
+			/*
+			 * The connection is freed by control_close();
+			 * continuing the loop would read from and re-arm
+			 * the freed ctl_conn.
+			 */
+			imsg_free(&imsg);
+			control_close(fd, cs);
+			return;
+		}
+		imsg_free(&imsg);
+	}
+
+	imsg_event_add(&c->iev);
+}
+
+/*
+ * Copy an imsg (type, peerid, pid and payload, no fd) to every control
+ * connection that has CTL_CONN_NOTIFY set.
+ */
+void
+control_imsg_forward(struct imsg *imsg)
+{
+	struct ctl_conn	*conn;
+
+	TAILQ_FOREACH(conn, &ctl_conns, entry) {
+		if ((conn->flags & CTL_CONN_NOTIFY) == 0)
+			continue;
+
+		imsg_compose_event(&conn->iev, imsg->hdr.type,
+		    imsg->hdr.peerid, imsg->hdr.pid, -1, imsg->data,
+		    imsg->hdr.len - IMSG_HEADER_SIZE);
+	}
+}
diff --git a/usr.sbin/vmd/proc.c b/usr.sbin/vmd/proc.c
new file mode 100644
index 00000000000..56f8d720d8a
--- /dev/null
+++ b/usr.sbin/vmd/proc.c
@@ -0,0 +1,632 @@
+/* $OpenBSD: proc.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */
+
+/*
+ * Copyright (c) 2010 - 2014 Reyk Floeter <reyk@openbsd.org>
+ * Copyright (c) 2008 Pierre-Yves Ritschard <pyr@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#include <pwd.h>
+#include <event.h>
+#include <imsg.h>
+
+#include "proc.h"
+
+void proc_open(struct privsep *, struct privsep_proc *,
+ struct privsep_proc *, size_t);
+void proc_close(struct privsep *);
+int proc_ispeer(struct privsep_proc *, unsigned int, enum privsep_procid);
+void proc_shutdown(struct privsep_proc *);
+void proc_sig_handler(int, short, void *);
+void proc_range(struct privsep *, enum privsep_procid, int *, int *);
+int proc_dispatch_null(int, struct privsep_proc *, struct imsg *);
+
+/*
+ * Report whether a process of the given type appears in the first nproc
+ * entries of procs.  Returns 1 if found, 0 otherwise.
+ */
+int
+proc_ispeer(struct privsep_proc *procs, unsigned int nproc,
+    enum privsep_procid type)
+{
+	unsigned int	 idx = 0;
+
+	while (idx < nproc) {
+		if (procs[idx].p_id == type)
+			return (1);
+		idx++;
+	}
+
+	return (0);
+}
+
+/*
+ * Called in the parent: allocate the pipe tables, record the parent's
+ * identity, open socket pairs to all children and start each child via
+ * its p_init callback, storing the returned pid.
+ */
+void
+proc_init(struct privsep *ps, struct privsep_proc *procs, unsigned int nproc)
+{
+	const unsigned int	 ninst = ps->ps_ninstances;
+	struct privsep_pipes	*table;
+	unsigned int		 src, dst, inst, slot, n;
+	enum privsep_procid	 id;
+	pid_t			 pid;
+
+	/*
+	 * Pipe layout:
+	 *
+	 * - ps->ps_pipes is an N:M mapping of source to destination
+	 *   processes, indexed [src][instance][dst][instance], e.g.
+	 *   [PROC_RELAY][3][PROC_CA][3].
+	 *
+	 * - ps->ps_pp is the 1:M slice of it belonging to the running
+	 *   process instance, indexed [dst][instance], e.g.
+	 *   [PROC_PARENT][0].
+	 *
+	 * Every slot is allocated for the maximum number of instances and
+	 * starts out as -1 (unused).
+	 */
+	for (src = 0; src < PROC_MAX; src++) {
+		ps->ps_pipes[src] = calloc(ninst,
+		    sizeof(struct privsep_pipes));
+		if (ps->ps_pipes[src] == NULL)
+			fatal("proc_init: calloc");
+
+		for (inst = 0; inst < ninst; inst++) {
+			table = &ps->ps_pipes[src][inst];
+
+			for (dst = 0; dst < PROC_MAX; dst++) {
+				table->pp_pipes[dst] = calloc(ninst,
+				    sizeof(int));
+				if (table->pp_pipes[dst] == NULL)
+					fatal("proc_init: calloc");
+
+				for (slot = 0; slot < ninst; slot++)
+					table->pp_pipes[dst][slot] = -1;
+			}
+		}
+	}
+
+	/* The caller is the parent process, instance 0. */
+	privsep_process = PROC_PARENT;
+	ps->ps_instances[PROC_PARENT] = 1;
+	ps->ps_title[PROC_PARENT] = "parent";
+	ps->ps_pid[PROC_PARENT] = getpid();
+	ps->ps_pp = &ps->ps_pipes[privsep_process][0];
+
+	for (n = 0; n < nproc; n++) {
+		id = procs[n].p_id;
+
+		/* Every process runs at least one instance. */
+		if (ps->ps_instances[id] < 1)
+			ps->ps_instances[id] = 1;
+		ps->ps_title[id] = procs[n].p_title;
+	}
+
+	proc_open(ps, NULL, procs, nproc);
+
+	/* Start the children. */
+	for (n = 0; n < nproc; n++) {
+		pid = (*procs[n].p_init)(ps, &procs[n]);
+		ps->ps_pid[procs[n].p_id] = pid;
+	}
+}
+
+/*
+ * Parent only: send SIGTERM to the process group of every recorded pid,
+ * reap children until waitpid(2) fails with something other than EINTR,
+ * then close the parent's imsg channels.  A no-op in any other process.
+ */
+void
+proc_kill(struct privsep *ps)
+{
+	unsigned int	 id;
+
+	if (privsep_process != PROC_PARENT)
+		return;
+
+	for (id = 0; id < PROC_MAX; id++) {
+		if (ps->ps_pid[id] != 0)
+			killpg(ps->ps_pid[id], SIGTERM);
+	}
+
+	for (;;) {
+		if (waitpid(WAIT_ANY, NULL, 0) == -1 && errno != EINTR)
+			break;
+	}
+
+	proc_close(ps);
+}
+
+/*
+ * Create the socket pairs between process p (or the current process
+ * when p is NULL) and every peer in procs, one pair per combination of
+ * instances.  Pairs that the peer already created are left alone.
+ *
+ * Also initializes each peer entry: back pointers to ps and its
+ * environment, and a default imsg callback if none was given.
+ */
+void
+proc_open(struct privsep *ps, struct privsep_proc *p,
+    struct privsep_proc *procs, size_t nproc)
+{
+	struct privsep_pipes	*mine, *theirs;
+	struct privsep_proc	*peer;
+	int			 sv[2];
+	unsigned int		 src, dst, s, d, k;
+
+	src = (p == NULL) ? privsep_process : p->p_id;
+
+	for (k = 0; k < nproc; k++) {
+		peer = &procs[k];
+		dst = peer->p_id;
+
+		peer->p_ps = ps;
+		peer->p_env = ps->ps_env;
+		if (peer->p_cb == NULL)
+			peer->p_cb = proc_dispatch_null;
+
+		for (s = 0; s < ps->ps_instances[src]; s++) {
+			for (d = 0; d < ps->ps_instances[dst]; d++) {
+				mine = &ps->ps_pipes[src][s];
+				theirs = &ps->ps_pipes[dst][d];
+
+				/* Already connected from the other side? */
+				if (mine->pp_pipes[dst][d] != -1)
+					continue;
+
+				if (socketpair(AF_UNIX,
+				    SOCK_STREAM | SOCK_NONBLOCK,
+				    PF_UNSPEC, sv) == -1)
+					fatal("socketpair");
+
+				mine->pp_pipes[dst][d] = sv[0];
+				theirs->pp_pipes[src][s] = sv[1];
+			}
+		}
+	}
+}
+
+/*
+ * Called in a freshly set up process: close every pipe end that belongs
+ * to some other process instance, then create an imsg event (handled by
+ * proc_dispatch()) for each open pipe to the peers listed in procs.
+ *
+ * Aborts if the process lists itself as a peer or memory runs out.
+ */
+void
+proc_listen(struct privsep *ps, struct privsep_proc *procs, size_t nproc)
+{
+	unsigned int		 i, dst, src, n, m;
+	struct privsep_pipes	*pp;
+
+	/*
+	 * Close unused pipes
+	 */
+	for (src = 0; src < PROC_MAX; src++) {
+		for (n = 0; n < ps->ps_instances[src]; n++) {
+			/* Ignore current process */
+			if (src == (unsigned int)privsep_process &&
+			    n == ps->ps_instance)
+				continue;
+
+			pp = &ps->ps_pipes[src][n];
+
+			for (dst = 0; dst < PROC_MAX; dst++) {
+				if (src == dst)
+					continue;
+				for (m = 0; m < ps->ps_instances[dst]; m++) {
+					if (pp->pp_pipes[dst][m] == -1)
+						continue;
+
+					/* Close and invalidate fd */
+					close(pp->pp_pipes[dst][m]);
+					pp->pp_pipes[dst][m] = -1;
+				}
+			}
+		}
+	}
+
+	src = privsep_process;
+	ps->ps_pp = pp = &ps->ps_pipes[src][ps->ps_instance];
+
+	/*
+	 * Listen on appropriate pipes
+	 */
+	for (i = 0; i < nproc; i++) {
+		dst = procs[i].p_id;
+
+		/* A logic error, not an errno condition. */
+		if (src == dst)
+			fatalx("proc_listen: cannot peer with oneself");
+
+		if ((ps->ps_ievs[dst] = calloc(ps->ps_instances[dst],
+		    sizeof(struct imsgev))) == NULL)
+			fatal("proc_listen: calloc");
+
+		for (n = 0; n < ps->ps_instances[dst]; n++) {
+			if (pp->pp_pipes[dst][n] == -1)
+				continue;
+
+			imsg_init(&(ps->ps_ievs[dst][n].ibuf),
+			    pp->pp_pipes[dst][n]);
+			ps->ps_ievs[dst][n].handler = proc_dispatch;
+			ps->ps_ievs[dst][n].events = EV_READ;
+			ps->ps_ievs[dst][n].proc = &procs[i];
+			ps->ps_ievs[dst][n].data = &ps->ps_ievs[dst][n];
+			procs[i].p_instance = n;
+
+			event_set(&(ps->ps_ievs[dst][n].ev),
+			    ps->ps_ievs[dst][n].ibuf.fd,
+			    ps->ps_ievs[dst][n].events,
+			    ps->ps_ievs[dst][n].handler,
+			    ps->ps_ievs[dst][n].data);
+			event_add(&(ps->ps_ievs[dst][n].ev), NULL);
+		}
+	}
+}
+
+/*
+ * Shut down all imsg channels of the running process: for every peer
+ * that proc_listen() set up, remove the events, clear the imsg buffers,
+ * close the pipe descriptors and release the imsgev array.
+ *
+ * Safe to call with ps == NULL and safe to call more than once.
+ */
+void
+proc_close(struct privsep *ps)
+{
+	unsigned int		 dst, n;
+	struct privsep_pipes	*pp;
+
+	if (ps == NULL)
+		return;
+
+	pp = ps->ps_pp;
+
+	for (dst = 0; dst < PROC_MAX; dst++) {
+		if (ps->ps_ievs[dst] == NULL)
+			continue;
+
+		for (n = 0; n < ps->ps_instances[dst]; n++) {
+			if (pp->pp_pipes[dst][n] == -1)
+				continue;
+
+			/* Cancel the fd, close and invalidate the fd */
+			event_del(&(ps->ps_ievs[dst][n].ev));
+			imsg_clear(&(ps->ps_ievs[dst][n].ibuf));
+			close(pp->pp_pipes[dst][n]);
+			pp->pp_pipes[dst][n] = -1;
+		}
+		free(ps->ps_ievs[dst]);
+		/* Do not leave a dangling pointer for a later call. */
+		ps->ps_ievs[dst] = NULL;
+	}
+}
+
+/*
+ * Orderly exit of a child process: clean up the control socket (control
+ * process only), run the optional p_shutdown hook, close all imsg
+ * channels, log the exit and terminate with _exit(0).  Does not return.
+ */
+void
+proc_shutdown(struct privsep_proc *p)
+{
+	struct privsep	*ps = p->p_ps;
+
+	if (ps != NULL && p->p_id == PROC_CONTROL)
+		control_cleanup(&ps->ps_csock);
+
+	if (p->p_shutdown != NULL)
+		p->p_shutdown();
+
+	proc_close(ps);
+
+	log_info("%s exiting, pid %d", p->p_title, getpid());
+
+	_exit(0);
+}
+
+/*
+ * libevent signal callback installed by proc_run(); arg is the
+ * privsep_proc of the running process.  SIGINT and SIGTERM shut the
+ * process down, SIGCHLD/SIGHUP/SIGPIPE/SIGUSR1 are ignored and any other
+ * signal is a fatal error.
+ */
+void
+proc_sig_handler(int sig, short event, void *arg)
+{
+	struct privsep_proc	*self = arg;
+
+	if (sig == SIGINT || sig == SIGTERM) {
+		proc_shutdown(self);
+		return;
+	}
+
+	if (sig == SIGCHLD || sig == SIGHUP || sig == SIGPIPE ||
+	    sig == SIGUSR1)
+		return;		/* ignore */
+
+	fatalx("proc_sig_handler: unexpected signal");
+	/* NOTREACHED */
+}
+
+/*
+ * Fork and run a privilege-separated child process.
+ *
+ * In the caller (parent) this returns the pid of the new child, or 0 if
+ * ps_noaction is set and nothing was started.  In the child it never
+ * returns to the caller: the child opens its control sockets (control
+ * process only), chroots to p_chroot or the unprivileged user's home
+ * directory, drops privileges to ps_pw, forks any additional instances,
+ * installs signal handlers, sets up the imsg channels to its peers,
+ * calls the optional run hook and enters the event loop.  When the loop
+ * ends the process exits through proc_shutdown().
+ */
+pid_t
+proc_run(struct privsep *ps, struct privsep_proc *p,
+    struct privsep_proc *procs, unsigned int nproc,
+    void (*run)(struct privsep *, struct privsep_proc *, void *), void *arg)
+{
+	pid_t			 pid;
+	struct passwd		*pw;
+	const char		*root;
+	struct control_sock	*rcs;
+	unsigned int		 n;
+
+	if (ps->ps_noaction)
+		return (0);
+
+	proc_open(ps, p, procs, nproc);
+
+	/* Fork child handlers */
+	switch (pid = fork()) {
+	case -1:
+		fatal("proc_run: cannot fork");
+	case 0:
+		log_procinit(p->p_title);
+
+		/* Set the process group of the current process */
+		setpgid(0, 0);
+		break;
+	default:
+		return (pid);
+	}
+
+	pw = ps->ps_pw;
+
+	/* The sockets must be created before chroot and privilege drop. */
+	if (p->p_id == PROC_CONTROL && ps->ps_instance == 0) {
+		if (control_init(ps, &ps->ps_csock) == -1)
+			fatalx(__func__);
+		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
+			if (control_init(ps, rcs) == -1)
+				fatalx(__func__);
+	}
+
+	/* Change root directory */
+	if (p->p_chroot != NULL)
+		root = p->p_chroot;
+	else
+		root = pw->pw_dir;
+
+	if (chroot(root) == -1)
+		fatal("proc_run: chroot");
+	if (chdir("/") == -1)
+		fatal("proc_run: chdir(\"/\")");
+
+	privsep_process = p->p_id;
+
+	setproctitle("%s", p->p_title);
+
+	if (setgroups(1, &pw->pw_gid) ||
+	    setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
+	    setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
+		fatal("proc_run: cannot drop privileges");
+
+	/*
+	 * Fork the additional instances.  Each new instance leaves the
+	 * loop with its own index; instance 0 keeps forking.  A failed
+	 * fork is fatal, like the initial fork above, instead of silently
+	 * running with fewer instances than configured.
+	 */
+	for (n = 1; n < ps->ps_instances[p->p_id]; n++) {
+		pid = fork();
+		if (pid == -1)
+			fatal("proc_run: cannot fork");
+		if (pid == 0) {
+			ps->ps_instance = p->p_instance = n;
+			break;
+		}
+	}
+
+#ifdef DEBUG
+	log_debug("%s: %s %d/%d, pid %d", __func__, p->p_title,
+	    ps->ps_instance + 1, ps->ps_instances[p->p_id], getpid());
+#endif
+
+	event_init();
+
+	signal_set(&ps->ps_evsigint, SIGINT, proc_sig_handler, p);
+	signal_set(&ps->ps_evsigterm, SIGTERM, proc_sig_handler, p);
+	signal_set(&ps->ps_evsigchld, SIGCHLD, proc_sig_handler, p);
+	signal_set(&ps->ps_evsighup, SIGHUP, proc_sig_handler, p);
+	signal_set(&ps->ps_evsigpipe, SIGPIPE, proc_sig_handler, p);
+	signal_set(&ps->ps_evsigusr1, SIGUSR1, proc_sig_handler, p);
+
+	signal_add(&ps->ps_evsigint, NULL);
+	signal_add(&ps->ps_evsigterm, NULL);
+	signal_add(&ps->ps_evsigchld, NULL);
+	signal_add(&ps->ps_evsighup, NULL);
+	signal_add(&ps->ps_evsigpipe, NULL);
+	signal_add(&ps->ps_evsigusr1, NULL);
+
+	proc_listen(ps, procs, nproc);
+
+	if (p->p_id == PROC_CONTROL && ps->ps_instance == 0) {
+		TAILQ_INIT(&ctl_conns);
+		if (control_listen(&ps->ps_csock) == -1)
+			fatalx(__func__);
+		TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry)
+			if (control_listen(rcs) == -1)
+				fatalx(__func__);
+	}
+
+	if (run != NULL)
+		run(ps, p, arg);
+
+	event_dispatch();
+
+	proc_shutdown(p);
+
+	return (0);
+}
+
+/*
+ * Event handler for an imsg channel to a peer process (arg is the
+ * channel's imsgev).
+ *
+ * Reads and/or flushes the channel, then hands every complete message
+ * to the peer's p_cb callback.  Messages the callback does not accept
+ * (non-zero return) get generic treatment: IMSG_CTL_VERBOSE adjusts the
+ * log verbosity, anything else is fatal.  A closed pipe removes the
+ * event and asks the event loop to exit.
+ */
+void
+proc_dispatch(int fd, short event, void *arg)
+{
+	struct imsgev		*iev = arg;
+	struct privsep_proc	*p = iev->proc;
+	struct privsep		*ps = p->p_ps;
+	struct imsgbuf		*ibuf = &iev->ibuf;
+	const char		*title = ps->ps_title[privsep_process];
+	struct imsg		 imsg;
+	ssize_t			 n;
+	int			 verbose;
+
+	if (event & EV_READ) {
+		n = imsg_read(ibuf);
+		if (n == -1)
+			fatal(__func__);
+		if (n == 0) {
+			/* this pipe is dead, so remove the event handler */
+			event_del(&iev->ev);
+			event_loopexit(NULL);
+			return;
+		}
+	}
+
+	if ((event & EV_WRITE) &&
+	    msgbuf_write(&ibuf->w) <= 0 && errno != EAGAIN)
+		fatal(__func__);
+
+	while ((n = imsg_get(ibuf, &imsg)) != 0) {
+		if (n == -1)
+			fatal(__func__);
+
+#if DEBUG > 1
+		log_debug("%s: %s %d got imsg %d from %s %d",
+		    __func__, title, ps->ps_instance + 1,
+		    imsg.hdr.type, p->p_title, p->p_instance);
+#endif
+
+		/* Offer the message to the program callback first. */
+		if ((p->p_cb)(fd, p, &imsg) != 0) {
+			/* Not handled there: generic message handling. */
+			if (imsg.hdr.type != IMSG_CTL_VERBOSE) {
+				log_warnx("%s: %s %d got invalid imsg %d "
+				    "from %s %d",
+				    __func__, title, ps->ps_instance + 1,
+				    imsg.hdr.type, p->p_title, p->p_instance);
+				fatalx(__func__);
+			}
+
+			IMSG_SIZE_CHECK(&imsg, &verbose);
+			memcpy(&verbose, imsg.data, sizeof(verbose));
+			log_verbose(verbose);
+		}
+
+		imsg_free(&imsg);
+	}
+
+	imsg_event_add(iev);
+}
+
+/*
+ * Default imsg callback installed by proc_open() for peers without one.
+ * It never handles a message and always returns -1.
+ */
+int
+proc_dispatch_null(int fd, struct privsep_proc *p, struct imsg *imsg)
+{
+	const int	 unhandled = -1;
+
+	return (unhandled);
+}
+
+/*
+ * imsg helper functions
+ */
+
+/*
+ * (Re-)register the event of an imsg channel: always for reading, and
+ * also for writing while output is queued.  A channel without a handler
+ * is flushed synchronously instead.
+ */
+void
+imsg_event_add(struct imsgev *iev)
+{
+	short	 wanted = EV_READ;
+
+	if (iev->handler == NULL) {
+		imsg_flush(&iev->ibuf);
+		return;
+	}
+
+	if (iev->ibuf.w.queued)
+		wanted |= EV_WRITE;
+	iev->events = wanted;
+
+	event_del(&iev->ev);
+	event_set(&iev->ev, iev->ibuf.fd, iev->events, iev->handler,
+	    iev->data);
+	event_add(&iev->ev, NULL);
+}
+
+/*
+ * Queue an imsg on a channel with imsg_compose() and, if that worked,
+ * update the channel's event registration so the message gets written.
+ * Returns the result of imsg_compose() (-1 on failure).
+ */
+int
+imsg_compose_event(struct imsgev *iev, uint16_t type, uint32_t peerid,
+    pid_t pid, int fd, void *data, uint16_t datalen)
+{
+	int	 rv;
+
+	rv = imsg_compose(&iev->ibuf, type, peerid, pid, fd, data, datalen);
+	if (rv != -1)
+		imsg_event_add(iev);
+
+	return (rv);
+}
+
+/*
+ * Scatter/gather variant of imsg_compose_event(): queue an imsg built
+ * from an iovec with imsg_composev() and, on success, update the
+ * channel's event registration.  Returns the result of imsg_composev()
+ * (-1 on failure).
+ */
+int
+imsg_composev_event(struct imsgev *iev, uint16_t type, uint32_t peerid,
+    pid_t pid, int fd, const struct iovec *iov, int iovcnt)
+{
+	int	 rv;
+
+	rv = imsg_composev(&iev->ibuf, type, peerid, pid, fd, iov, iovcnt);
+	if (rv != -1)
+		imsg_event_add(iev);
+
+	return (rv);
+}
+
+/*
+ * Turn an instance selector into the half-open range [*n, *m).
+ * A selector of -1 means all instances of process id; any other value
+ * selects just that one instance.
+ */
+void
+proc_range(struct privsep *ps, enum privsep_procid id, int *n, int *m)
+{
+	if (*n != -1) {
+		/* A single slot of the specified peer process */
+		*m = *n + 1;
+		return;
+	}
+
+	/* Every instance of the target process */
+	*n = 0;
+	*m = ps->ps_instances[id];
+}
+
+/*
+ * Send an imsg to instance n of process id, or to all of its instances
+ * when n is -1.  Messages are composed with peerid -1 and pid 0.
+ * Returns 0 on success, -1 as soon as one compose fails.
+ */
+int
+proc_compose_imsg(struct privsep *ps, enum privsep_procid id, int n,
+    uint16_t type, int fd, void *data, uint16_t datalen)
+{
+	int	 slot, end;
+
+	proc_range(ps, id, &n, &end);
+
+	slot = n;
+	while (slot < end) {
+		if (imsg_compose_event(&ps->ps_ievs[id][slot],
+		    type, -1, 0, fd, data, datalen) == -1)
+			return (-1);
+		slot++;
+	}
+
+	return (0);
+}
+
+/*
+ * iovec variant of proc_compose_imsg(): send an imsg to instance n of
+ * process id, or to all instances when n is -1.
+ * Returns 0 on success, -1 as soon as one compose fails.
+ */
+int
+proc_composev_imsg(struct privsep *ps, enum privsep_procid id, int n,
+    uint16_t type, int fd, const struct iovec *iov, int iovcnt)
+{
+	int	 slot, end;
+
+	proc_range(ps, id, &n, &end);
+
+	slot = n;
+	while (slot < end) {
+		if (imsg_composev_event(&ps->ps_ievs[id][slot],
+		    type, -1, 0, fd, iov, iovcnt) == -1)
+			return (-1);
+		slot++;
+	}
+
+	return (0);
+}
+
+/*
+ * Re-send a received imsg (same type, fd and payload) to instance n of
+ * process id, or to all instances when n is -1.
+ * Returns the result of proc_compose_imsg().
+ */
+int
+proc_forward_imsg(struct privsep *ps, struct imsg *imsg,
+    enum privsep_procid id, int n)
+{
+	uint16_t	 payload_len = IMSG_DATA_SIZE(imsg);
+
+	return (proc_compose_imsg(ps, id, n, imsg->hdr.type,
+	    imsg->fd, imsg->data, payload_len));
+}
+
+/*
+ * Return the imsg buffer of the channel to instance n of process id
+ * (the first instance when n is -1).
+ */
+struct imsgbuf *
+proc_ibuf(struct privsep *ps, enum privsep_procid id, int n)
+{
+	struct imsgev	*iev;
+	int		 end;
+
+	proc_range(ps, id, &n, &end);
+	iev = &ps->ps_ievs[id][n];
+
+	return (&iev->ibuf);
+}
+
+/*
+ * Return the imsg event structure of the channel to instance n of
+ * process id (the first instance when n is -1).
+ */
+struct imsgev *
+proc_iev(struct privsep *ps, enum privsep_procid id, int n)
+{
+	struct imsgev	*peers;
+	int		 end;
+
+	proc_range(ps, id, &n, &end);
+	peers = ps->ps_ievs[id];
+
+	return (&peers[n]);
+}
diff --git a/usr.sbin/vmd/proc.h b/usr.sbin/vmd/proc.h
new file mode 100644
index 00000000000..2c192551ef4
--- /dev/null
+++ b/usr.sbin/vmd/proc.h
@@ -0,0 +1,187 @@
+/* $OpenBSD: proc.h,v 1.1 2015/12/02 09:14:25 reyk Exp $ */
+
+/*
+ * Copyright (c) 2010-2015 Reyk Floeter <reyk@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/queue.h>
+#include <sys/uio.h>
+
+#include <imsg.h>
+#include <event.h>
+
+#ifndef _PROC_H
+#define _PROC_H
+
+enum { /* imsg types shared by proc.c and control.c */
+ IMSG_NONE,
+ IMSG_CTL_OK,
+ IMSG_CTL_FAIL, /* sent to a control client that requests notify twice */
+ IMSG_CTL_VERBOSE, /* payload is an int handed to log_verbose() */
+ IMSG_CTL_END,
+ IMSG_CTL_NOTIFY, /* control client asks to receive forwarded imsgs */
+ IMSG_PROC_MAX /* presumably the first value free for daemon-specific types -- TODO confirm in vmd.h */
+};
+
+/* imsg */
+struct imsgev { /* an imsg buffer together with its libevent registration */
+ struct imsgbuf ibuf; /* imsg buffer; ibuf.fd is the descriptor being watched */
+ void (*handler)(int, short, void *); /* event callback; NULL makes imsg_event_add() flush synchronously */
+ struct event ev; /* event re-registered by imsg_event_add() */
+ struct privsep_proc *proc; /* peer process entry, set by proc_listen() */
+ void *data; /* argument passed to handler */
+ short events; /* EV_READ, plus EV_WRITE while output is queued */
+};
+
+/*
+ * IMSG_SIZE_CHECK(imsg, p): abort via fatalx() if the payload of imsg is
+ * smaller than the object p points to.  Both arguments are pointers.
+ *
+ * IMSG_DATA_SIZE(imsg): payload length of imsg, i.e. the total length
+ * from the header minus the header itself.
+ *
+ * Macro arguments are parenthesized so that arbitrary pointer
+ * expressions can be passed.
+ */
+#define IMSG_SIZE_CHECK(imsg, p) do {				\
+	if (IMSG_DATA_SIZE(imsg) < sizeof(*(p)))		\
+		fatalx("bad length imsg received");		\
+} while (0)
+#define IMSG_DATA_SIZE(imsg)	((imsg)->hdr.len - IMSG_HEADER_SIZE)
+
+/* control socket */
+struct control_sock { /* one listening UNIX-domain control socket */
+ const char *cs_name; /* socket path; NULL means this socket is not used */
+ struct event cs_ev; /* read event that runs control_accept() */
+ struct event cs_evt; /* timer that pauses accepting when out of descriptors */
+ int cs_fd; /* listening descriptor, set by control_init() */
+ int cs_restricted; /* non-zero: socket file is made world read/writable */
+ void *cs_env; /* owning struct privsep, set by control_init() */
+
+ TAILQ_ENTRY(control_sock) cs_entry; /* linkage in a control_socks list */
+};
+TAILQ_HEAD(control_socks, control_sock);
+
+struct { /* NOTE(review): object defined in a header; not referenced by the sources shown here */
+ struct event ev;
+ int fd;
+} control_state; /* NOTE(review): every file including proc.h gets its own tentative definition -- confirm this is intended */
+
+struct ctl_conn { /* one accepted client connection on a control socket */
+ TAILQ_ENTRY(ctl_conn) entry; /* linkage in ctl_conns */
+ u_int8_t flags; /* CTL_CONN_* bits */
+ u_int waiting; /* not used by the sources shown here */
+#define CTL_CONN_NOTIFY 0x01 /* client receives copies of incoming imsgs */
+ struct imsgev iev; /* imsg channel to the client */
+
+};
+TAILQ_HEAD(ctl_connlist, ctl_conn);
+extern struct ctl_connlist ctl_conns;
+
+/* privsep */
+enum privsep_procid { /* identifies a process type; used as array index */
+ PROC_PARENT = 0, /* the parent process */
+ PROC_CONTROL, /* the control socket process */
+ PROC_MAX, /* number of process types, sizes the per-process arrays */
+} privsep_process; /* type of the running process; NOTE(review): object defined in a header */
+
+struct privsep_pipes { /* pipe ends held by one process instance */
+ int *pp_pipes[PROC_MAX]; /* [dst][instance] socketpair fd, -1 if unused */
+};
+
+struct privsep { /* state of the privilege separation framework */
+ struct privsep_pipes *ps_pipes[PROC_MAX]; /* [src][instance] pipe tables, allocated by proc_init() */
+ struct privsep_pipes *ps_pp; /* pipe table of the running process instance */
+
+ struct imsgev *ps_ievs[PROC_MAX]; /* [dst][instance] imsg channels, allocated by proc_listen() */
+ const char *ps_title[PROC_MAX]; /* process titles by process id */
+ pid_t ps_pid[PROC_MAX]; /* pids recorded by proc_init() */
+ struct passwd *ps_pw; /* user the children drop privileges to */
+ int ps_noaction; /* non-zero: proc_run() returns without forking */
+
+ struct control_sock ps_csock; /* main control socket */
+ struct control_socks ps_rcsocks; /* list of additional control sockets */
+
+ u_int ps_instances[PROC_MAX]; /* number of instances per process id */
+ u_int ps_ninstances; /* allocation size of the per-instance arrays */
+ u_int ps_instance; /* instance index of the running process */
+
+ /* Event and signal handlers */
+ struct event ps_evsigint;
+ struct event ps_evsigterm;
+ struct event ps_evsigchld;
+ struct event ps_evsighup;
+ struct event ps_evsigpipe;
+ struct event ps_evsigusr1;
+
+ void *ps_env; /* opaque pointer copied to p_env by proc_open() */
+};
+
+struct privsep_proc { /* description of one process type or peer */
+ const char *p_title; /* name used for logging and the process title */
+ enum privsep_procid p_id; /* process type */
+ int (*p_cb)(int, struct privsep_proc *,
+ struct imsg *); /* imsg callback; returns 0 when it handled the message */
+ pid_t (*p_init)(struct privsep *,
+ struct privsep_proc *); /* starts the process; proc_init() stores the result as its pid */
+ const char *p_chroot; /* chroot directory; NULL selects ps_pw->pw_dir */
+ struct privsep *p_ps; /* back pointer, set by proc_open() */
+ void *p_env; /* copy of ps_env, set by proc_open() */
+ void (*p_shutdown)(void); /* optional hook run by proc_shutdown() */
+ u_int p_instance; /* instance index */
+};
+
+/* proc.c */
+void proc_init(struct privsep *, struct privsep_proc *, unsigned int);
+void proc_kill(struct privsep *);
+void proc_listen(struct privsep *, struct privsep_proc *, size_t);
+void proc_dispatch(int, short event, void *);
+pid_t proc_run(struct privsep *, struct privsep_proc *,
+ struct privsep_proc *, unsigned int,
+ void (*)(struct privsep *, struct privsep_proc *, void *), void *);
+void imsg_event_add(struct imsgev *);
+int imsg_compose_event(struct imsgev *, uint16_t, uint32_t,
+ pid_t, int, void *, uint16_t);
+int imsg_composev_event(struct imsgev *, uint16_t, uint32_t,
+ pid_t, int, const struct iovec *, int);
+int proc_compose_imsg(struct privsep *, enum privsep_procid, int,
+ uint16_t, int, void *, uint16_t);
+int proc_composev_imsg(struct privsep *, enum privsep_procid, int,
+ uint16_t, int, const struct iovec *, int);
+int proc_forward_imsg(struct privsep *, struct imsg *,
+ enum privsep_procid, int);
+struct imsgbuf *
+ proc_ibuf(struct privsep *, enum privsep_procid, int);
+struct imsgev *
+ proc_iev(struct privsep *, enum privsep_procid, int);
+
+/* control.c */
+pid_t control(struct privsep *, struct privsep_proc *);
+int control_init(struct privsep *, struct control_sock *);
+int control_listen(struct control_sock *);
+void control_cleanup(struct control_sock *);
+
+/* log.c */
+void log_init(int, int);
+void log_procinit(const char *);
+void log_verbose(int);
+void log_warn(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+void log_warnx(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+void log_info(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+void log_debug(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+void logit(int, const char *, ...)
+ __attribute__((__format__ (printf, 2, 3)));
+void vlog(int, const char *, va_list)
+ __attribute__((__format__ (printf, 2, 0)));
+__dead void fatal(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+__dead void fatalx(const char *, ...)
+ __attribute__((__format__ (printf, 1, 2)));
+
+#endif /* _PROC_H */
diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c
index 35ad6d749ff..b72874397b7 100644
--- a/usr.sbin/vmd/vmd.c
+++ b/usr.sbin/vmd/vmd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.c,v 1.8 2015/11/26 08:26:48 reyk Exp $ */
+/* $OpenBSD: vmd.c,v 1.9 2015/12/02 09:14:25 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -16,163 +16,105 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
-/*
- * vmd(8) - virtual machine daemon
- */
-
-#include <sys/types.h>
-#include <sys/ioctl.h>
+#include <sys/param.h>
#include <sys/queue.h>
-#include <sys/uio.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/un.h>
#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/time.h>
-
-#include <dev/ic/comreg.h>
-#include <dev/ic/i8253reg.h>
-#include <dev/isa/isareg.h>
-#include <dev/pci/pcireg.h>
-
-#include <machine/param.h>
-#include <machine/vmmvar.h>
+#include <sys/cdefs.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <errno.h>
+#include <event.h>
#include <fcntl.h>
-#include <imsg.h>
-#include <limits.h>
-#include <pthread.h>
#include <pwd.h>
#include <signal.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
#include <syslog.h>
-#include <termios.h>
#include <unistd.h>
-#include <poll.h>
-#include <util.h>
+#include "proc.h"
#include "vmd.h"
-#include "loadfile.h"
-#include "pci.h"
-#include "virtio.h"
-
-#define NR_BACKLOG 5
-
-#define MAX_TAP 256
-
-/*
- * Emulated 8250 UART
- *
- */
-#define COM1_DATA 0x3f8
-#define COM1_IER 0x3f9
-#define COM1_IIR 0x3fa
-#define COM1_LCR 0x3fb
-#define COM1_MCR 0x3fc
-#define COM1_LSR 0x3fd
-#define COM1_MSR 0x3fe
-#define COM1_SCR 0x3ff
-
-/*
- * Emulated i8253 PIT (counter)
- */
-#define TIMER_BASE 0x40
-#define TIMER_CTRL 0x43 /* 8253 Timer #1 */
-#define NS_PER_TICK (1000000000 / TIMER_FREQ)
-
-/* i8253 registers */
-struct i8253_counter {
- struct timeval tv; /* timer start time */
- uint16_t start; /* starting value */
- uint16_t olatch; /* output latch */
- uint16_t ilatch; /* input latch */
- uint8_t last_r; /* last read byte (MSB/LSB) */
- uint8_t last_w; /* last written byte (MSB/LSB) */
-};
-
-/* ns8250 UART registers */
-struct ns8250_regs {
- uint8_t lcr; /* Line Control Register */
- uint8_t fcr; /* FIFO Control Register */
- uint8_t iir; /* Interrupt ID Register */
- uint8_t ier; /* Interrupt Enable Register */
- uint8_t divlo; /* Baud rate divisor low byte */
- uint8_t divhi; /* Baud rate divisor high byte */
- uint8_t msr; /* Modem Status Register */
- uint8_t lsr; /* Line Status Register */
- uint8_t mcr; /* Modem Control Register */
- uint8_t scr; /* Scratch Register */
- uint8_t data; /* Unread input data */
-};
-
-struct i8253_counter i8253_counter[3];
-struct ns8250_regs com1_regs;
__dead void usage(void);
-void sighdlr(int);
-int main(int, char **);
-int control_run(void);
-int start_vm(struct imsg *);
-int terminate_vm(struct imsg *);
-int get_info_vm(struct imsgbuf *);
-int start_client_vmd(void);
-int opentap(void);
-int run_vm(int *, int *, struct vm_create_params *);
-void *vcpu_run_loop(void *);
-int vcpu_exit(struct vm_run_params *);
-int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vm_create_params *, int *, int *);
-void vcpu_exit_inout(struct vm_run_params *);
-uint8_t vcpu_exit_pci(struct vm_run_params *);
-void vcpu_exit_i8253(union vm_exit *);
-void vcpu_exit_com(struct vm_run_params *);
-void vcpu_process_com_data(union vm_exit *);
-void vcpu_process_com_lcr(union vm_exit *);
-void vcpu_process_com_lsr(union vm_exit *);
-void vcpu_process_com_ier(union vm_exit *);
-void vcpu_process_com_mcr(union vm_exit *);
-void vcpu_process_com_iir(union vm_exit *);
-void vcpu_process_com_msr(union vm_exit *);
-void vcpu_process_com_scr(union vm_exit *);
+int main(int, char **);
+int vmd_configure(void);
+void vmd_sighdlr(int sig, short event, void *arg);
+void vmd_shutdown(void);
+int vmd_control_run(void);
-int vmm_fd, con_fd, vm_id;
-volatile sig_atomic_t quit;
+struct vmd *env;
-SLIST_HEAD(vmstate_head, vmstate);
-struct vmstate_head vmstate;
-
-extern char *__progname;
+static struct privsep_proc procs[] = {
+ { "control", PROC_CONTROL, vmm_dispatch_control, control },
+};
-/*
- * sighdlr
- *
- * Signal handler for TERM/INT/CHLD signals used during daemon shutdown
- *
- * Parameters:
- * sig: signal caught
- */
+/*
+ * vmd_sighdlr
+ *
+ * Signal callback of the parent process; main() registers it with
+ * signal_set() for SIGINT, SIGTERM, SIGCHLD, SIGHUP, SIGPIPE and SIGUSR1.
+ *
+ * SIGHUP, SIGPIPE and SIGUSR1 are only logged. SIGTERM and SIGINT request
+ * a shutdown and then fall through to the SIGCHLD handling, which reaps
+ * every exited child, logs the ones that failed and also requests a
+ * shutdown once a process recorded in ps->ps_pid[] is gone.
+ *
+ * Parameters:
+ * sig: signal caught
+ * event: event flags handed over by the event loop (unused)
+ * arg: the struct privsep of this daemon
+ */
void
-sighdlr(int sig)
+vmd_sighdlr(int sig, short event, void *arg)
{
- pid_t pid;
+ struct privsep *ps = arg;
+ int die = 0, status, fail, id;
+ pid_t pid;
+ char *cause;
+ const char *title = "vm";
switch (sig) {
+ case SIGHUP:
+ log_info("%s: ignoring SIGHUP", __func__);
+ break;
+ case SIGPIPE:
+ log_info("%s: ignoring SIGPIPE", __func__);
+ break;
+ case SIGUSR1:
+ log_info("%s: ignoring SIGUSR1", __func__);
+ break;
case SIGTERM:
case SIGINT:
- /* Tell main imsg loop to exit */
- quit = 1;
- break;
+ die = 1;
+ /* FALLTHROUGH */
case SIGCHLD:
do {
- pid = waitpid(WAIT_ANY, NULL, WNOHANG);
- } while (pid != -1 || (pid == -1 && errno == EINTR));
+ int len;
+
+ pid = waitpid(-1, &status, WNOHANG);
+ if (pid <= 0)
+ continue;
+
+ fail = 0;
+ /*
+ * Start from the default title for every child so that a
+ * child not found in ps_pid[] is never reported under the
+ * title of a process reaped earlier in this loop.
+ */
+ title = "vm";
+ if (WIFSIGNALED(status)) {
+ fail = 1;
+ len = asprintf(&cause, "terminated; signal %d",
+ WTERMSIG(status));
+ } else if (WIFEXITED(status)) {
+ if (WEXITSTATUS(status) != 0) {
+ fail = 1;
+ len = asprintf(&cause,
+ "exited abnormally");
+ } else
+ len = asprintf(&cause, "exited okay");
+ } else
+ fatalx("unexpected cause of SIGCHLD");
+
+ if (len == -1)
+ fatal("asprintf");
+
+ /* losing one of the privsep processes ends the daemon */
+ for (id = 0; id < PROC_MAX; id++) {
+ if (pid == ps->ps_pid[id]) {
+ die = 1;
+ title = ps->ps_title[id];
+ break;
+ }
+ }
+ if (fail)
+ log_warnx("lost child: %s %s", title, cause);
+
+ free(cause);
+ } while (pid > 0 || (pid == -1 && errno == EINTR));
+
+ if (die)
+ vmd_shutdown();
break;
+ default:
+ fatalx("unexpected signal");
}
}
@@ -187,1453 +129,114 @@ usage(void)
int
main(int argc, char **argv)
{
- int debug = 0, verbose = 0, c, res;
+ struct privsep *ps;
+ int ch;
+
+ if ((env = calloc(1, sizeof(*env))) == NULL)
+ fatal("calloc: env");
- while ((c = getopt(argc, argv, "dv")) != -1) {
- switch (c) {
+ while ((ch = getopt(argc, argv, "dvn")) != -1) {
+ switch (ch) {
case 'd':
- debug = 2;
+ env->vmd_debug = 2;
break;
case 'v':
- verbose++;
+ env->vmd_verbose++;
+ break;
+ case 'n':
+ env->vmd_noaction = 1;
break;
default:
usage();
}
}
- /* log to stderr until daemonized */
- log_init(debug ? debug : 1, LOG_DAEMON);
-
- /* Open /dev/vmm */
- vmm_fd = open(VMM_NODE, O_RDONLY);
- if (vmm_fd == -1)
- fatal("can't open vmm device node %s", VMM_NODE);
-
- setproctitle("control");
-
- SLIST_INIT(&vmstate);
-
- signal(SIGTERM, sighdlr);
- signal(SIGINT, sighdlr);
- signal(SIGCHLD, sighdlr);
-
- log_init(debug, LOG_DAEMON);
- log_verbose(verbose);
- log_procinit("control");
-
- if (!debug && daemon(1, 0) == -1)
- fatal("can't daemonize");
-
- res = control_run();
-
- if (res == -1)
- fatalx("control socket error");
-
- return (0);
-}
-
-/*
- * control_run
- *
- * Main control loop - establishes listening socket for incoming vmmctl(8)
- * requests and dispatches appropriate calls to vmm(4). Replies to
- * vmmctl(8) using imsg.
- *
- * Return values:
- * 0: normal exit (signal to quit received)
- * -1: abnormal exit (various causes)
- */
-int
-control_run(void)
-{
- struct sockaddr_un sun, c_sun;
- socklen_t len;
- int fd, connfd, n, res, nfd;
- mode_t mode, old_umask;
- char *socketpath;
- struct imsgbuf *ibuf;
- struct imsg imsg;
- struct pollfd pfd[1];
-
- /* Establish and start listening on control socket */
- socketpath = SOCKET_NAME;
- if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)) == -1) {
- log_warn("%s: socket error", __progname);
- return (-1);
- }
-
- bzero(&sun, sizeof(sun));
- sun.sun_family = AF_UNIX;
- if (strlcpy(sun.sun_path, socketpath, sizeof(sun.sun_path)) >=
- sizeof(sun.sun_path)) {
- log_warnx("%s: socket name too long", __progname);
- close(fd);
- return (-1);
- }
-
- if (unlink(socketpath) == -1)
- if (errno != ENOENT) {
- log_warn("%s: unlink of %s failed",
- __progname, socketpath);
- close(fd);
- return (-1);
- }
-
- old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH);
- mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP;
-
- if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) {
- log_warn("%s: control_init: bind of %s failed",
- __progname, socketpath);
- close(fd);
- umask(old_umask);
- return (-1);
- }
-
- umask(old_umask);
-
- if (chmod(socketpath, mode) == -1) {
- log_warn("%s: control_init: chmod of %s failed",
- __progname, socketpath);
- close(fd);
- unlink(socketpath);
- return (-1);
- }
+ /* check for root privileges */
+ if (geteuid())
+ fatalx("need root privileges");
- if ((ibuf = malloc(sizeof(struct imsgbuf))) == NULL) {
- log_warn("%s: out of memory", __progname);
- close(fd);
- unlink(socketpath);
- return (-1);
- }
+ SLIST_INIT(&env->vmd_vmstate);
- if (listen(fd, NR_BACKLOG) == -1) {
- log_warn("%s: listen failed", __progname);
- close(fd);
- unlink(socketpath);
- return (-1);
- }
+ ps = &env->vmd_ps;
+ ps->ps_env = env;
+ TAILQ_INIT(&ps->ps_rcsocks);
- while (!quit) {
- pfd[0].fd = fd;
- pfd[0].events = POLLIN;
+ if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL)
+ fatal("unknown user %s", VMD_USER);
- nfd = poll(pfd, 1, INFTIM);
- if (nfd == -1) {
- if (errno == EINTR)
- continue;
- fatal("poll");
- }
- if (nfd == 0)
- continue;
- if ((pfd[0].revents & (POLLERR|POLLNVAL)))
- fatalx("bad fd %d", fd);
- if ((pfd[0].revents & (POLLIN|POLLHUP)) == 0)
- fatalx("bad fd %d events", fd);
+ /* Configure the control socket */
+ ps->ps_csock.cs_name = SOCKET_NAME;
- if ((connfd = accept4(fd, (struct sockaddr *)&c_sun, &len,
- SOCK_CLOEXEC)) == -1) {
- log_warn("%s: accept4 error", __progname);
- close(fd);
- unlink(socketpath);
- return (-1);
- }
-
- imsg_init(ibuf, connfd);
- if ((n = imsg_read(ibuf)) == -1 || n == 0) {
- log_warnx("%s: imsg_read error, n=%d",
- __progname, n);
- continue;
- }
-
- for (;;) {
- if ((n = imsg_get(ibuf, &imsg)) == -1)
- return (-1);
-
- if (n == 0)
- break;
-
- /* Process incoming message (from vmmctl(8)) */
- switch (imsg.hdr.type) {
- case IMSG_VMDOP_START_VM_REQUEST:
- res = start_vm(&imsg);
- imsg_compose(ibuf,
- IMSG_VMDOP_START_VM_RESPONSE, 0, 0, -1,
- &res, sizeof(res));
- break;
- case IMSG_VMDOP_TERMINATE_VM_REQUEST:
- res = terminate_vm(&imsg);
- imsg_compose(ibuf,
- IMSG_VMDOP_TERMINATE_VM_RESPONSE, 0, 0, -1,
- &res, sizeof(res));
- break;
- case IMSG_VMDOP_GET_INFO_VM_REQUEST:
- res = get_info_vm(ibuf);
- imsg_compose(ibuf,
- IMSG_VMDOP_GET_INFO_VM_END_DATA, 0, 0, -1,
- &res, sizeof(res));
- break;
- }
-
- while (ibuf->w.queued)
- if (msgbuf_write(&ibuf->w) <= 0 && errno !=
- EAGAIN) {
- log_warn("%s: msgbuf_write error",
- __progname);
- close(fd);
- close(connfd);
- unlink(socketpath);
- return (-1);
- }
- imsg_free(&imsg);
- }
- close(connfd);
- }
-
- signal(SIGCHLD, SIG_IGN);
-
- return (0);
-}
-
-/*
- * terminate_vm
- *
- * Requests vmm(4) to terminate the VM whose ID is provided in the
- * supplied vm_terminate_params structure (vtp->vtp_vm_id)
- *
- * Parameters
- * imsg: The incoming imsg body whose 'data' field contains the
- * vm_terminate_params struct
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not
- * valid)
- */
-int
-terminate_vm(struct imsg *imsg)
-{
- struct vm_terminate_params *vtp;
-
- vtp = (struct vm_terminate_params *)imsg->data;
-
- if (ioctl(vmm_fd, VMM_IOC_TERM, vtp) < 0)
- return (errno);
-
- return (0);
-}
-
-/*
- * opentap
- *
- * Opens the next available tap device, up to MAX_TAP.
- *
- * Returns a file descriptor to the tap node opened, or -1 if no tap
- * devices were available.
- */
-int
-opentap(void)
-{
- int i, fd;
- char path[PATH_MAX];
-
- for (i = 0; i < MAX_TAP; i++) {
- snprintf(path, PATH_MAX, "/dev/tap%d", i);
- fd = open(path, O_RDWR | O_NONBLOCK);
- if (fd != -1)
- return (fd);
- }
-
- return (-1);
-}
-
-/*
- * start_vm
- *
- * Starts a new VM with the creation parameters supplied (in the incoming
- * imsg->data field). This function performs a basic sanity check on the
- * incoming parameters and then performs the following steps to complete
- * the creation of the VM:
- *
- * 1. opens the VM disk image files specified in the VM creation parameters
- * 2. opens the specified VM kernel
- * 3. creates a VM console tty pair using openpty
- * 4. forks, passing the file descriptors opened in steps 1-3 to the child
- * vmd responsible for dropping privilege and running the VM's VCPU
- * loops.
- *
- * Parameters:
- * imsg: The incoming imsg body whose 'data' field is a vm_create_params
- * struct containing the VM creation parameters.
- *
- * Return values:
- * 0: success
- * !0 : failure - typically an errno indicating the source of the failure
- */
-int
-start_vm(struct imsg *imsg)
-{
- struct vm_create_params *vcp;
- size_t i;
- off_t kernel_size;
- struct stat sb;
- int child_disks[VMM_MAX_DISKS_PER_VM], kernel_fd, ret, ttym_fd;
- int child_taps[VMM_MAX_NICS_PER_VM];
- int ttys_fd;
- char ptyn[32];
-
- vcp = (struct vm_create_params *)imsg->data;
-
- for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++)
- child_disks[i] = -1;
- for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++)
- child_taps[i] = -1;
-
- /*
- * XXX kernel_fd can't be global (possible race if multiple VMs
- * being created at the same time). Probably need to move this
- * into the child before dropping privs, or just make it local
- * to this function?
- */
- kernel_fd = -1;
-
- ttym_fd = -1;
- ttys_fd = -1;
-
- /* Open disk images for child */
- for (i = 0 ; i < vcp->vcp_ndisks; i++) {
- child_disks[i] = open(vcp->vcp_disks[i], O_RDWR);
- if (child_disks[i] == -1) {
- ret = errno;
- log_warn("%s: can't open %s", __progname,
- vcp->vcp_disks[i]);
- goto err;
- }
- }
-
- bzero(&sb, sizeof(sb));
- if (stat(vcp->vcp_kernel, &sb) == -1) {
- ret = errno;
- log_warn("%s: can't stat kernel image %s",
- __progname, vcp->vcp_kernel);
- goto err;
- }
-
- kernel_size = sb.st_size;
-
- /* Open kernel image */
- kernel_fd = open(vcp->vcp_kernel, O_RDONLY);
- if (kernel_fd == -1) {
- ret = errno;
- log_warn("%s: can't open kernel image %s",
- __progname, vcp->vcp_kernel);
- goto err;
- }
-
- if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) {
- ret = errno;
- log_warn("%s: openpty failed", __progname);
- goto err;
- }
-
- if (close(ttys_fd)) {
- ret = errno;
- log_warn("%s: close tty failed", __progname);
- goto err;
- }
-
- /* Open tap devices for child */
- for (i = 0 ; i < vcp->vcp_nnics; i++) {
- child_taps[i] = opentap();
- if (child_taps[i] == -1) {
- ret = errno;
- log_warn("%s: can't open tap for nic %zd",
- __progname, i);
- goto err;
- }
- }
-
- /* Start child vmd for this VM (fork, chroot, drop privs) */
- ret = start_client_vmd();
-
- /* Start child failed? - cleanup and leave */
- if (ret == -1) {
- ret = EIO;
- goto err;
- }
-
- if (ret > 0) {
- /* Parent */
- for (i = 0 ; i < vcp->vcp_ndisks; i++)
- close(child_disks[i]);
-
- for (i = 0 ; i < vcp->vcp_nnics; i++)
- close(child_taps[i]);
-
- close(kernel_fd);
- close(ttym_fd);
+ /* Open /dev/vmm */
+ env->vmd_fd = open(VMM_NODE, O_RDWR);
+ if (env->vmd_fd == -1)
+ fatal("can't open vmm device node %s", VMM_NODE);
- return (0);
- }
- else {
- /* Child */
- setproctitle(vcp->vcp_name);
- log_procinit(vcp->vcp_name);
+ /* log to stderr until daemonized */
+ log_init(env->vmd_debug ? env->vmd_debug : 1, LOG_DAEMON);
- log_info("%s: vm console: %s", __progname, ptyn);
- ret = vmm_create_vm(vcp);
- if (ret) {
- errno = ret;
- fatal("create vmm ioctl failed - exiting");
- }
+ /*
+ * Re-initialize logging with the real debug setting and apply -v,
+ * as main() did before it was split up. -n only validates the
+ * configuration and reports the result on stderr, so it keeps the
+ * stderr logger and is never sent to the background.
+ */
+ if (!env->vmd_noaction)
+ log_init(env->vmd_debug, LOG_DAEMON);
+ log_verbose(env->vmd_verbose);
+
+ if (!env->vmd_debug && !env->vmd_noaction && daemon(0, 0) == -1)
+ fatal("can't daemonize");
- /* Load kernel image */
- ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size);
- if (ret) {
- errno = ret;
- fatal("failed to load kernel - exiting");
- }
+ ps->ps_ninstances = 1;
+ proc_init(ps, procs, nitems(procs));
- close(kernel_fd);
+ setproctitle("parent");
+ log_procinit("parent");
- con_fd = ttym_fd;
- if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
- fatal("failed to set nonblocking mode on console");
+ event_init();
- /* Execute the vcpu run loop(s) for this VM */
- ret = run_vm(child_disks, child_taps, vcp);
- _exit(ret != 0);
- }
-
- return (ret);
+ signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps);
+ signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps);
+ signal_set(&ps->ps_evsigchld, SIGCHLD, vmd_sighdlr, ps);
+ signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps);
+ signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps);
+ signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps);
-err:
- for (i = 0 ; i < vcp->vcp_ndisks; i++)
- if (child_disks[i] != -1)
- close(child_disks[i]);
+ signal_add(&ps->ps_evsigint, NULL);
+ signal_add(&ps->ps_evsigterm, NULL);
+ signal_add(&ps->ps_evsigchld, NULL);
+ signal_add(&ps->ps_evsighup, NULL);
+ signal_add(&ps->ps_evsigpipe, NULL);
+ signal_add(&ps->ps_evsigusr1, NULL);
- for (i = 0 ; i < vcp->vcp_nnics; i++)
- if (child_taps[i] != -1)
- close(child_taps[i]);
+ proc_listen(ps, procs, nitems(procs));
- if (kernel_fd != -1)
- close(kernel_fd);
+ if (vmd_configure() == -1)
+ fatalx("configuration failed");
- if (ttym_fd != -1)
- close(ttym_fd);
+ event_dispatch();
- return (ret);
-}
+ log_debug("parent exiting");
-/*
- * get_info_vm
- *
- * Returns a list of VMs known to vmm(4).
- *
- * Parameters:
- * ibuf: the imsg ibuf in which to place the results. A new imsg will
- * be created using this ibuf.
- *
- * Return values:
- * 0: success
- * !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
- */
-int
-get_info_vm(struct imsgbuf *ibuf)
-{
- int ret;
- size_t ct, i;
- struct ibuf *obuf;
- struct vm_info_params vip;
- struct vm_info_result *info;
-
- /*
- * We issue the VMM_IOC_INFO ioctl twice, once with an input
- * buffer size of 0, which results in vmm(4) returning the
- * number of bytes required back to us in vip.vip_size,
- * and then we call it again after malloc'ing the required
- * number of bytes.
- *
- * It is possible that we could fail a second time (eg, if
- * another VM was created in the instant between the two
- * ioctls, but in that case the caller can just try again
- * as vmm(4) will return a zero-sized list in that case.
- */
- vip.vip_size = 0;
- info = NULL;
- ret = 0;
-
- /* First ioctl to see how many bytes needed (vip.vip_size) */
- if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0)
- return (errno);
-
- if (vip.vip_info_ct != 0)
- return (EIO);
-
- info = malloc(vip.vip_size);
- if (info == NULL)
- return (ENOMEM);
-
- /* Second ioctl to get the actual list */
- vip.vip_info = info;
- if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0) {
- ret = errno;
- free(info);
- return (ret);
- }
-
- /* Return info to vmmctl(4) */
- ct = vip.vip_size / sizeof(struct vm_info_result);
- for (i = 0; i < ct; i++) {
- obuf = imsg_create(ibuf, IMSG_VMDOP_GET_INFO_VM_DATA, 0, 0,
- sizeof(struct vm_info_result));
- imsg_add(obuf, &info[i], sizeof(struct vm_info_result));
- imsg_close(ibuf, obuf);
- }
- free(info);
return (0);
}
-
-/*
- * start_client_vmd
- *
- * forks a copy of the parent vmd, chroots to VMD_USER's home, drops
- * privileges (changes to user VMD_USER), and returns.
- * Should the fork operation succeed, but later chroot/privsep
- * fail, the child exits.
- *
- * Return values (returns to both child and parent on success):
- * -1 : failure
- * 0: return to child vmd returns 0
- * !0 : return to parent vmd returns the child's pid
- */
int
-start_client_vmd(void)
+vmd_configure(void)
{
- int child_pid;
- struct passwd *pw;
-
- pw = getpwnam(VMD_USER);
- if (pw == NULL) {
- log_warnx("%s: no such user %s", __progname, VMD_USER);
- return (-1);
+#if 0
+ if (parse_config(env->sc_conffile, env) == -1) {
+ proc_kill(&env->sc_ps);
+ exit(1);
}
+#endif
- child_pid = fork();
- if (child_pid < 0)
- return (-1);
-
- if (!child_pid) {
- /* Child */
- if (chroot(pw->pw_dir) != 0)
- fatal("unable to chroot");
- if (chdir("/") != 0)
- fatal("unable to chdir");
-
- if (setgroups(1, &pw->pw_gid) == -1)
- fatal("setgroups() failed");
- if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1)
- fatal("setresgid() failed");
- if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1)
- fatal("setresuid() failed");
-
- return (0);
+ if (env->vmd_noaction) {
+ fprintf(stderr, "configuration OK\n");
+ proc_kill(&env->vmd_ps);
+ exit(0);
}
- /* Parent */
- return (child_pid);
-}
-
-/*
- * vmm_create_vm
- *
- * Requests vmm(4) to create a new VM using the supplied creation
- * parameters. This operation results in the creation of the in-kernel
- * structures for the VM, but does not start the VM's vcpu(s).
- *
- * Parameters:
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- *
- * Return values:
- * 0: success
- * !0 : ioctl to vmm(4) failed
- */
-int
-vmm_create_vm(struct vm_create_params *vcp)
-{
- /* Sanity check arguments */
- if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
- return (EINVAL);
-
- if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE)
- return (EINVAL);
-
- if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
- return (EINVAL);
-
- if (ioctl(vmm_fd, VMM_IOC_CREATE, vcp) < 0)
- return (errno);
-
return (0);
}
-/*
- * init_emulated_hw
- *
- * Initializes the userspace hardware emulation
- */
-void
-init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
- int *child_taps)
-{
- /* Init the i8253 PIT's 3 counters */
- bzero(&i8253_counter, sizeof(struct i8253_counter) * 3);
- gettimeofday(&i8253_counter[0].tv, NULL);
- gettimeofday(&i8253_counter[1].tv, NULL);
- gettimeofday(&i8253_counter[2].tv, NULL);
- i8253_counter[0].start = TIMER_DIV(100);
- i8253_counter[1].start = TIMER_DIV(100);
- i8253_counter[2].start = TIMER_DIV(100);
-
- /* Init ns8250 UART */
- bzero(&com1_regs, sizeof(struct ns8250_regs));
-
- /* Initialize PCI */
- pci_init();
-
- /* Initialize virtio devices */
- virtio_init(vcp, child_disks, child_taps);
-}
-
-/*
- * run_vm
- *
- * Runs the VM whose creation parameters are specified in vcp
- *
- * Parameters:
- * vcp: vm_create_params struct containing the VM's desired creation
- * configuration
- * child_disks: previously-opened child VM disk file file descriptors
- * child_taps: previously-opened child tap file descriptors
- *
- * Return values:
- * 0: the VM exited normally
- * !0 : the VM exited abnormally or failed to start
- */
-int
-run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp)
-{
- size_t i;
- int ret;
- pthread_t *tid;
- void *exit_status;
- struct vm_run_params **vrp;
-
- ret = 0;
-
- /* XXX cap vcp_ncpus to avoid overflow here */
- /*
- * XXX ensure nvcpus in vcp is same as vm, or fix vmm to return einval
- * on bad vcpu id
- */
- tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus);
- vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus);
- if (tid == NULL || vrp == NULL) {
- log_warn("%s: memory allocation error - exiting.",
- __progname);
- return (ENOMEM);
- }
-
- init_emulated_hw(vcp, child_disks, child_taps);
-
- /*
- * Create and launch one thread for each VCPU. These threads may
- * migrate between PCPUs over time; the need to reload CPU state
- * in such situations is detected and performed by vmm(4) in the
- * kernel.
- */
- for (i = 0 ; i < vcp->vcp_ncpus; i++) {
- vrp[i] = malloc(sizeof(struct vm_run_params));
- if (vrp[i] == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_exit = malloc(sizeof(union vm_exit));
- if (vrp[i]->vrp_exit == NULL) {
- log_warn("%s: memory allocation error - "
- "exiting.", __progname);
- /* caller will exit, so skip free'ing */
- return (ENOMEM);
- }
- vrp[i]->vrp_vm_id = vcp->vcp_id;
- vrp[i]->vrp_vcpu_id = i;
-
- /* Start each VCPU run thread at vcpu_run_loop */
- ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
- if (ret) {
- /* caller will _exit after this return */
- return (ret);
- }
- }
-
- /* Wait for all the threads to exit */
- for (i = 0; i < vcp->vcp_ncpus; i++) {
- if (pthread_join(tid[i], &exit_status)) {
- log_warn("%s: failed to join thread %zd - "
- "exiting", __progname, i);
- return (EIO);
- }
-
- if (exit_status != NULL) {
- log_warnx("%s: vm %d vcpu run thread %zd exited "
- "abnormally", __progname, vcp->vcp_id, i);
- ret = EIO;
- }
- }
-
- return (ret);
-}
-
-/*
- * vcpu_run_loop
- *
- * Runs a single VCPU until vmm(4) requires help handling an exit,
- * or the VM terminates.
- *
- * Parameters:
- * arg: vcpu_run_params for the VCPU being run by this thread
- *
- * Return values:
- * NULL: the VCPU shutdown properly
- * !NULL: error processing VCPU run, or the VCPU shutdown abnormally
- */
-void *
-vcpu_run_loop(void *arg)
-{
- struct vm_run_params *vrp = (struct vm_run_params *)arg;
- intptr_t ret;
-
- vrp->vrp_continue = 0;
- vrp->vrp_injint = -1;
-
- for (;;) {
- if (ioctl(vmm_fd, VMM_IOC_RUN, vrp) < 0) {
- /* If run ioctl failed, exit */
- ret = errno;
- return ((void *)ret);
- }
-
- /* If the VM is terminating, exit normally */
- if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
- return (NULL);
-
- if (vrp->vrp_exit_reason != VM_EXIT_NONE) {
- /*
- * vmm(4) needs help handling an exit, handle in
- * vcpu_exit.
- */
- if (vcpu_exit(vrp))
- return ((void *)EIO);
- }
- }
-
- return (NULL);
-}
-
-/*
- * vcpu_exit_i8253
- *
- * Handles emulated i8253 PIT access (in/out instruction to PIT ports).
- * We don't emulate all the modes of the i8253, just the basic squarewave
- * clock.
- *
- * Parameters:
- * vei: VM exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_exit_i8253(union vm_exit *vei)
-{
- uint32_t out_data;
- uint8_t sel, rw, data;
- uint64_t ns, ticks;
- struct timeval now, delta;
-
- if (vei->vei.vei_port == TIMER_CTRL) {
- if (vei->vei.vei_dir == 0) { /* OUT instruction */
- out_data = vei->vei.vei_data;
- sel = out_data &
- (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2);
- sel = sel >> 6;
- if (sel > 2) {
- log_warnx("%s: i8253 PIT: invalid "
- "timer selected (%d)",
- __progname, sel);
- return;
- }
-
- rw = vei->vei.vei_data &
- (TIMER_LATCH | TIMER_LSB |
- TIMER_MSB | TIMER_16BIT);
-
- if (rw == TIMER_16BIT) {
- /*
- * XXX this seems to be used on occasion, needs
- * to be implemented
- */
- log_warnx("%s: i8253 PIT: 16 bit "
- "counter I/O not supported",
- __progname);
- return;
- }
-
- /*
- * Since we don't truly emulate each tick of the PIT
- * clock, when the guest asks for the timer to be
- * latched, simulate what the counter would have been
- * had we performed full emulation. We do this by
- * calculating when the counter was reset vs how much
- * time has elapsed, then bias by the counter tick
- * rate.
- */
- if (rw == TIMER_LATCH) {
- gettimeofday(&now, NULL);
- delta.tv_sec = now.tv_sec -
- i8253_counter[sel].tv.tv_sec;
- delta.tv_usec = now.tv_usec -
- i8253_counter[sel].tv.tv_usec;
- if (delta.tv_usec < 0) {
- delta.tv_sec--;
- delta.tv_usec += 1000000;
- }
- if (delta.tv_usec > 1000000) {
- delta.tv_sec++;
- delta.tv_usec -= 1000000;
- }
- ns = delta.tv_usec * 1000 +
- delta.tv_sec * 1000000000;
- ticks = ns / NS_PER_TICK;
- i8253_counter[sel].olatch =
- i8253_counter[sel].start -
- ticks % i8253_counter[sel].start;
- return;
- }
-
- log_warnx("%s: i8253 PIT: unsupported rw mode "
- "%d", __progname, rw);
- return;
- } else {
- /* XXX should this return 0xff? */
- log_warnx("%s: i8253 PIT: read from control "
- "port unsupported", __progname);
- }
- } else {
- sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE);
- if (vei->vei.vei_dir == 0) { /* OUT instruction */
- if (i8253_counter[sel].last_w == 0) {
- out_data = vei->vei.vei_data;
- i8253_counter[sel].ilatch |= (out_data << 8);
- i8253_counter[sel].last_w = 1;
- } else {
- out_data = vei->vei.vei_data;
- i8253_counter[sel].ilatch |= out_data;
- i8253_counter[sel].start =
- i8253_counter[sel].ilatch;
- i8253_counter[sel].last_w = 0;
- }
- } else {
- if (i8253_counter[sel].last_r == 0) {
- data = i8253_counter[sel].olatch >> 8;
- vei->vei.vei_data = data;
- i8253_counter[sel].last_w = 1;
- } else {
- data = i8253_counter[sel].olatch & 0xFF;
- vei->vei.vei_data = data;
- i8253_counter[sel].last_w = 0;
- }
- }
- }
-}
-
-/*
- * vcpu_process_com_data
- *
- * Emulate in/out instructions to the com1 (ns8250) UART data register
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_data(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * The guest wrote to the data register. Since we are emulating a
- * no-fifo chip, write the character immediately to the pty and
- * assert TXRDY in IIR (if the guest has requested TXRDY interrupt
- * reporting)
- */
- if (vei->vei.vei_dir == 0) {
- write(con_fd, &vei->vei.vei_data, 1);
- if (com1_regs.ier & 0x2) {
- /* Set TXRDY */
- com1_regs.iir |= IIR_TXRDY;
- /* Set "interrupt pending" (IIR low bit cleared) */
- com1_regs.iir &= ~0x1;
- }
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * The guest read from the data register. Check to see if
- * there is data available (RXRDY) and if so, consume the
- * input data and return to the guest. Also clear the
- * interrupt info register regardless.
- */
- if (com1_regs.lsr & LSR_RXRDY) {
- vei->vei.vei_data = com1_regs.data;
- com1_regs.data = 0x0;
- com1_regs.lsr &= ~LSR_RXRDY;
- } else {
- /* XXX should this be com1_regs.data or 0xff? */
- vei->vei.vei_data = com1_regs.data;
- log_warnx("guest reading com1 when not ready");
- }
-
- /* Reading the data register always clears RXRDY from IIR */
- com1_regs.iir &= ~IIR_RXRDY;
-
- /*
- * Clear "interrupt pending" by setting IIR low bit to 1
- * if no interrupt are pending
- */
- if (com1_regs.iir == 0x0)
- com1_regs.iir = 0x1;
- }
-}
-
-/*
- * vcpu_process_com_lcr
- *
- * Emulate in/out instructions to the com1 (ns8250) UART line control register
- *
- * Paramters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_lcr(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write content to line control register
- */
- if (vei->vei.vei_dir == 0) {
- com1_regs.lcr = (uint8_t)vei->vei.vei_data;
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read line control register
- */
- vei->vei.vei_data = com1_regs.lcr;
- }
-}
-
-/*
- * vcpu_process_com_iir
- *
- * Emulate in/out instructions to the com1 (ns8250) UART interrupt information
- * register. Note that writes to this register actually are to a different
- * register, the FCR (FIFO control register) that we don't emulate but still
- * consume the data provided.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
void
-vcpu_process_com_iir(union vm_exit *vei)
+vmd_shutdown(void)
{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to FCR
- */
- if (vei->vei.vei_dir == 0) {
- com1_regs.fcr = vei->vei.vei_data;
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read IIR. Reading the IIR resets the TXRDY bit in the IIR
- * after the data is read.
- */
- vei->vei.vei_data = com1_regs.iir;
- com1_regs.iir &= ~IIR_TXRDY;
+ proc_kill(&env->vmd_ps);
+ free(env);
- /*
- * Clear "interrupt pending" by setting IIR low bit to 1
- * if no interrupts are pending
- */
- if (com1_regs.iir == 0x0)
- com1_regs.iir = 0x1;
- }
-}
-
-/*
- * vcpu_process_com_mcr
- *
- * Emulate in/out instructions to the com1 (ns8250) UART modem control
- * register.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_mcr(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to MCR
- */
- if (vei->vei.vei_dir == 0) {
- com1_regs.mcr = vei->vei.vei_data;
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read from MCR
- */
- vei->vei.vei_data = com1_regs.mcr;
- }
-}
-
-/*
- * vcpu_process_com_lsr
- *
- * Emulate in/out instructions to the com1 (ns8250) UART line status register.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_lsr(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to LSR. This is an illegal operation, so we just log it and
- * continue.
- */
- if (vei->vei.vei_dir == 0) {
- log_warnx("%s: LSR UART write 0x%x unsupported",
- __progname, vei->vei.vei_data);
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read from LSR. We always report TXRDY and TSRE since we
- * can process output characters immediately (at any time).
- */
- vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY;
- }
-}
-
-/*
- * vcpu_process_com_msr
- *
- * Emulate in/out instructions to the com1 (ns8250) UART modem status register.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_msr(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to MSR. This is an illegal operation, so we just log it and
- * continue.
- */
- if (vei->vei.vei_dir == 0) {
- log_warnx("%s: MSR UART write 0x%x unsupported",
- __progname, vei->vei.vei_data);
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read from MSR. We always report DCD, DSR, and CTS.
- */
- vei->vei.vei_data = com1_regs.lsr | MSR_DCD | MSR_DSR | MSR_CTS;
- }
-}
-
-/*
- * vcpu_process_com_scr
- *
- * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The
- * scratch register is sometimes used to distinguish an 8250 from a 16450,
- * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We
- * simulate an "original" 8250 by forcing the scratch register to return data
- * on read that is different from what was written.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_scr(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to SCR
- */
- if (vei->vei.vei_dir == 0) {
- com1_regs.scr = vei->vei.vei_data;
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read from SCR. To make sure we don't accidentally simulate
- * a real scratch register, we negate what was written on
- * subsequent readback.
- */
- vei->vei.vei_data = ~com1_regs.scr;
- }
-}
-
-/*
- * vcpu_process_com_ier
- *
- * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable
- * register.
- *
- * Parameters:
- * vei: vm exit information from vmm(4) containing information on the in/out
- * instruction being performed
- */
-void
-vcpu_process_com_ier(union vm_exit *vei)
-{
- /*
- * vei_dir == 0 : out instruction
- *
- * Write to IER
- */
- if (vei->vei.vei_dir == 0) {
- com1_regs.ier = vei->vei.vei_data;
- } else {
- /*
- * vei_dir == 1 : in instruction
- *
- * Read from IER
- */
- vei->vei.vei_data = com1_regs.ier;
- }
-}
-
-/*
- * vcpu_exit_com
- *
- * Process com1 (ns8250) UART exits. vmd handles most basic 8250
- * features with the exception of the divisor latch (eg, no baud
- * rate support)
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- */
-void
-vcpu_exit_com(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
-
- switch(vei->vei.vei_port) {
- case COM1_LCR:
- vcpu_process_com_lcr(vei);
- break;
- case COM1_IER:
- vcpu_process_com_ier(vei);
- break;
- case COM1_IIR:
- vcpu_process_com_iir(vei);
- break;
- case COM1_MCR:
- vcpu_process_com_mcr(vei);
- break;
- case COM1_LSR:
- vcpu_process_com_lsr(vei);
- break;
- case COM1_MSR:
- vcpu_process_com_msr(vei);
- break;
- case COM1_SCR:
- vcpu_process_com_scr(vei);
- break;
- case COM1_DATA:
- vcpu_process_com_data(vei);
- break;
- }
-}
-
-/*
- * vcpu_exit_pci
- *
- * Handle all I/O to the emulated PCI subsystem.
- *
- * Parameters:
- * vrp: vcpu run paramters containing guest state for this exit
- *
- * Return values:
- * 0xff if no interrupt is required after this pci exit,
- * or an interrupt vector otherwise
- */
-uint8_t
-vcpu_exit_pci(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr;
-
- intr = 0xFF;
-
- switch(vei->vei.vei_port) {
- case PCI_MODE1_ADDRESS_REG:
- pci_handle_address_reg(vrp);
- break;
- case PCI_MODE1_DATA_REG:
- pci_handle_data_reg(vrp);
- break;
- case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
- intr = pci_handle_io(vrp);
- break;
- default:
- log_warnx("%s: unknown PCI register 0x%llx",
- __progname, (uint64_t)vei->vei.vei_port);
- break;
- }
-
- return (intr);
-}
-
-/*
- * vcpu_exit_inout
- *
- * Handle all I/O exits that need to be emulated in vmd. This includes the
- * i8253 PIT and the com1 ns8250 UART.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- */
-void
-vcpu_exit_inout(struct vm_run_params *vrp)
-{
- union vm_exit *vei = vrp->vrp_exit;
- uint8_t intr;
-
- switch(vei->vei.vei_port) {
- case TIMER_CTRL:
- case (TIMER_CNTR0 + TIMER_BASE):
- case (TIMER_CNTR1 + TIMER_BASE):
- case (TIMER_CNTR2 + TIMER_BASE):
- vcpu_exit_i8253(vei);
- break;
- case COM1_DATA ... COM1_SCR:
- vcpu_exit_com(vrp);
- break;
- case PCI_MODE1_ADDRESS_REG:
- case PCI_MODE1_DATA_REG:
- case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
- intr = vcpu_exit_pci(vrp);
- if (intr != 0xFF)
- vrp->vrp_injint = intr;
- else
- vrp->vrp_injint = -1;
- break;
- default:
- /* IN from unsupported port gives FFs */
- if (vei->vei.vei_dir == 1)
- vei->vei.vei_data = 0xFFFFFFFF;
- break;
- }
-}
-
-/*
- * vcpu_exit
- *
- * Handle a vcpu exit. This function is called when it is determined that
- * vmm(4) requires the assistance of vmd to support a particular guest
- * exit type (eg, accessing an I/O port or device). Guest state is contained
- * in 'vrp', and will be resent to vmm(4) on exit completion.
- *
- * Upon conclusion of handling the exit, the function determines if any
- * interrupts should be injected into the guest, and sets vrp->vrp_injint
- * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt
- * is to be injected).
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return values:
- * 0: the exit was handled successfully
- * 1: an error occurred (exit not handled)
- */
-int
-vcpu_exit(struct vm_run_params *vrp)
-{
- ssize_t sz;
- char ch;
-
- switch (vrp->vrp_exit_reason) {
- case VMX_EXIT_IO:
- vcpu_exit_inout(vrp);
- break;
- case VMX_EXIT_HLT:
- /*
- * XXX handle halted state, no reason to run this vcpu again
- * until a vm interrupt is to be injected
- */
- break;
- default:
- log_warnx("%s: unknown exit reason %d",
- __progname, vrp->vrp_exit_reason);
- return (1);
- }
-
- /* XXX interrupt priority */
- if (vionet_process_rx())
- vrp->vrp_injint = 9;
-
- /*
- * Is there a new character available on com1?
- * If so, consume the character, buffer it into the com1 data register
- * assert IRQ4, and set the line status register RXRDY bit.
- *
- * XXX - move all this com intr checking to another function
- */
- sz = read(con_fd, &ch, sizeof(char));
- if (sz == 1) {
- com1_regs.lsr |= LSR_RXRDY;
- com1_regs.data = ch;
- /* XXX these ier and iir bits should be IER_x and IIR_x */
- if (com1_regs.ier & 0x1) {
- com1_regs.iir |= (2 << 1);
- com1_regs.iir &= ~0x1;
- }
- }
-
- /*
- * Clear "interrupt pending" by setting IIR low bit to 1 if no
- * interrupts are pending
- */
- /* XXX these iir magic numbers should be IIR_x */
- if ((com1_regs.iir & ~0x1) == 0x0)
- com1_regs.iir = 0x1;
-
- /* If pending interrupt and nothing waiting to be injected, inject */
- if ((com1_regs.iir & 0x1) == 0)
- if (vrp->vrp_injint == -1)
- vrp->vrp_injint = 0x4;
- vrp->vrp_continue = 1;
-
- return (0);
-}
-
-/*
- * write_page
- *
- * Pushes a page of data from 'buf' into the guest VM's memory
- * at paddr 'dst'.
- *
- * Parameters:
- * dst: the destination paddr_t in the guest VM to push into.
- * If there is no guest paddr mapping at 'dst', a new page will be
- * faulted in by the VMM (provided 'dst' represents a valid paddr
- * in the guest's address space)
- * buf: page of data to push
- * len: size of 'buf'
- * do_mask: 1 to mask the destination address (for kernel load), 0 to
- * leave 'dst' unmasked
- *
- * Return values:
- * various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error
- * occurred.
- *
- * Note - this function only handles GPAs < 4GB.
- */
-int
-write_page(uint32_t dst, void *buf, uint32_t len, int do_mask)
-{
- struct vm_writepage_params vwp;
-
- /*
- * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
- * errors
- */
- if (do_mask)
- dst &= 0xFFFFFFF;
-
- vwp.vwp_paddr = (paddr_t)dst;
- vwp.vwp_data = buf;
- vwp.vwp_vm_id = vm_id;
- vwp.vwp_len = len;
- if (ioctl(vmm_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
- log_warn("writepage ioctl failed");
- return (errno);
- }
- return (0);
-}
-
-/*
- * read_page
- *
- * Reads a page of memory at guest paddr 'src' into 'buf'.
- *
- * Parameters:
- * src: the source paddr_t in the guest VM to read from.
- * buf: destination (local) buffer
- * len: size of 'buf'
- * do_mask: 1 to mask the source address (for kernel load), 0 to
- * leave 'src' unmasked
- *
- * Return values:
- * various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error
- * occurred.
- *
- * Note - this function only handles GPAs < 4GB.
- */
-int
-read_page(uint32_t src, void *buf, uint32_t len, int do_mask)
-{
- struct vm_readpage_params vrp;
-
- /*
- * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
- * errors
- */
- if (do_mask)
- src &= 0xFFFFFFF;
-
- vrp.vrp_paddr = (paddr_t)src;
- vrp.vrp_data = buf;
- vrp.vrp_vm_id = vm_id;
- vrp.vrp_len = len;
- if (ioctl(vmm_fd, VMM_IOC_READPAGE, &vrp) < 0) {
- log_warn("readpage ioctl failed");
- return (errno);
- }
- return (0);
+ log_warnx("parent terminating");
+ exit(0);
}
diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h
index c960dcfa735..55e0a221255 100644
--- a/usr.sbin/vmd/vmd.h
+++ b/usr.sbin/vmd/vmd.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.h,v 1.3 2015/11/23 13:04:49 reyk Exp $ */
+/* $OpenBSD: vmd.h,v 1.4 2015/12/02 09:14:25 reyk Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -16,7 +16,10 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
-#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include "proc.h"
#ifndef __VMD_H__
#define __VMD_H__
@@ -25,6 +28,9 @@
#define SOCKET_NAME "/var/run/vmd.sock"
#define VMM_NODE "/dev/vmm"
#define VM_NAME_MAX 256
+#define MAX_TAP 256
+#define NR_BACKLOG 5
+
/* #define VMD_DEBUG */
@@ -34,10 +40,8 @@
#define dprintf(x...)
#endif /* VMM_DEBUG */
-
enum imsg_type {
- IMSG_NONE,
- IMSG_VMDOP_DISABLE_VMM_REQUEST,
+ IMSG_VMDOP_DISABLE_VMM_REQUEST = IMSG_PROC_MAX,
IMSG_VMDOP_DISABLE_VMM_RESPONSE,
IMSG_VMDOP_ENABLE_VMM_REQUEST,
IMSG_VMDOP_ENABLE_VMM_RESPONSE,
@@ -50,28 +54,22 @@ enum imsg_type {
IMSG_VMDOP_GET_INFO_VM_END_DATA
};
+SLIST_HEAD(vmstate_head, vmstate);
+
+/*
+ * Global vmd daemon state. vmm.c reaches it through the external 'env'
+ * pointer.
+ */
+struct vmd {
+ struct privsep vmd_ps; /* privsep state, see proc.h */
+ int vmd_fd; /* descriptor the VMM_IOC_* ioctls in vmm.c are issued on */
+
+ /* NOTE(review): presumably set from command line options - confirm in vmd.c */
+ int vmd_debug;
+ int vmd_verbose;
+ int vmd_noaction;
+
+ struct vmstate_head vmd_vmstate; /* SLIST head of struct vmstate entries */
+};
+
+/* vmm.c */
+int vmm_dispatch_control(int, struct privsep_proc *, struct imsg *);
int write_page(uint32_t dst, void *buf, uint32_t, int);
int read_page(uint32_t dst, void *buf, uint32_t, int);
-/* log.c */
-void log_init(int, int);
-void log_procinit(const char *);
-void log_verbose(int);
-void log_warn(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-void log_warnx(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-void log_info(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-void log_debug(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-void logit(int, const char *, ...)
- __attribute__((__format__ (printf, 2, 3)));
-void vlog(int, const char *, va_list)
- __attribute__((__format__ (printf, 2, 0)));
-__dead void fatal(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-__dead void fatalx(const char *, ...)
- __attribute__((__format__ (printf, 1, 2)));
-
#endif /* __VMD_H__ */
diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c
new file mode 100644
index 00000000000..32c10397e86
--- /dev/null
+++ b/usr.sbin/vmd/vmm.c
@@ -0,0 +1,1408 @@
+/* $OpenBSD: vmm.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */
+
+/*
+ * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/ioctl.h>
+#include <sys/queue.h>
+#include <sys/uio.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <dev/ic/comreg.h>
+#include <dev/ic/i8253reg.h>
+#include <dev/isa/isareg.h>
+#include <dev/pci/pcireg.h>
+
+#include <machine/param.h>
+#include <machine/vmmvar.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <imsg.h>
+#include <limits.h>
+#include <pthread.h>
+#include <pwd.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <termios.h>
+#include <unistd.h>
+#include <poll.h>
+#include <util.h>
+
+#include "vmd.h"
+#include "loadfile.h"
+#include "pci.h"
+#include "virtio.h"
+#include "proc.h"
+
+/*
+ * Emulated 8250 UART
+ *
+ */
+#define COM1_DATA 0x3f8
+#define COM1_IER 0x3f9
+#define COM1_IIR 0x3fa
+#define COM1_LCR 0x3fb
+#define COM1_MCR 0x3fc
+#define COM1_LSR 0x3fd
+#define COM1_MSR 0x3fe
+#define COM1_SCR 0x3ff
+
+/*
+ * Emulated i8253 PIT (counter)
+ */
+#define TIMER_BASE 0x40
+#define TIMER_CTRL 0x43 /* 8253 Timer #1 */
+#define NS_PER_TICK (1000000000 / TIMER_FREQ)
+
+/*
+ * i8253 registers
+ *
+ * State of one emulated PIT counter; vmm.c keeps three of them in the
+ * i8253_counter[] array. 'olatch' is filled when the guest issues a latch
+ * command and is what reads of the counter port return; 'ilatch' collects
+ * the bytes the guest writes to the counter port and becomes 'start' once
+ * both bytes have arrived.
+ */
+struct i8253_counter {
+ struct timeval tv; /* timer start time */
+ uint16_t start; /* starting value */
+ uint16_t olatch; /* output latch */
+ uint16_t ilatch; /* input latch */
+ uint8_t last_r; /* last read byte (MSB/LSB) */
+ uint8_t last_w; /* last written byte (MSB/LSB) */
+};
+
+/*
+ * ns8250 UART registers
+ *
+ * Register file of the emulated com1 UART (instantiated once as com1_regs).
+ * 'data' holds at most one received character; it is consumed when the
+ * guest reads the data register while LSR_RXRDY is set in 'lsr'.
+ */
+struct ns8250_regs {
+ uint8_t lcr; /* Line Control Register */
+ uint8_t fcr; /* FIFO Control Register */
+ uint8_t iir; /* Interrupt ID Register */
+ uint8_t ier; /* Interrupt Enable Register */
+ uint8_t divlo; /* Baud rate divisor low byte */
+ uint8_t divhi; /* Baud rate divisor high byte */
+ uint8_t msr; /* Modem Status Register */
+ uint8_t lsr; /* Line Status Register */
+ uint8_t mcr; /* Modem Control Register */
+ uint8_t scr; /* Scratch Register */
+ uint8_t data; /* Unread input data */
+};
+
+/* Emulated device state: the three PIT counters and the com1 UART */
+struct i8253_counter i8253_counter[3];
+struct ns8250_regs com1_regs;
+
+/* VM lifecycle helpers */
+int start_client_vmd(void);
+int opentap(void);
+int start_vm(struct imsg *);
+int terminate_vm(struct imsg *);
+int get_info_vm(struct privsep *, struct imsg *);
+int run_vm(int *, int *, struct vm_create_params *);
+void *vcpu_run_loop(void *);
+int vcpu_exit(struct vm_run_params *);
+int vmm_create_vm(struct vm_create_params *);
+void init_emulated_hw(struct vm_create_params *, int *, int *);
+/* I/O port exit handlers (PCI, i8253 PIT, com1 UART) */
+void vcpu_exit_inout(struct vm_run_params *);
+uint8_t vcpu_exit_pci(struct vm_run_params *);
+void vcpu_exit_i8253(union vm_exit *);
+void vcpu_exit_com(struct vm_run_params *);
+void vcpu_process_com_data(union vm_exit *);
+void vcpu_process_com_lcr(union vm_exit *);
+void vcpu_process_com_lsr(union vm_exit *);
+void vcpu_process_com_ier(union vm_exit *);
+void vcpu_process_com_mcr(union vm_exit *);
+void vcpu_process_com_iir(union vm_exit *);
+void vcpu_process_com_msr(union vm_exit *);
+void vcpu_process_com_scr(union vm_exit *);
+
+/*
+ * con_fd: master side of the VM console pty; set in the child by start_vm.
+ * vm_id: NOTE(review): presumably the id of the VM run by this process -
+ * confirm where it is assigned.
+ */
+int con_fd, vm_id;
+
+/* Global daemon state, defined outside this file */
+extern struct vmd *env;
+
+extern char *__progname;
+
+/*
+ * vmm_dispatch_control
+ *
+ * Handles one imsg received from the control process: runs the requested
+ * VM operation and sends the integer result back to PROC_CONTROL using the
+ * matching response type and the peerid of the request.
+ *
+ * Parameters:
+ *  fd: unused
+ *  p: privsep process descriptor; p->p_ps supplies the imsg event channels
+ *  imsg: the request to handle
+ *
+ * Return values:
+ *  0: request handled and reply queued
+ *  -1: unknown request type, or the reply could not be composed
+ */
+int
+vmm_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg)
+{
+	struct privsep	*ps = p->p_ps;
+	int		 result = 0, reply = 0;
+
+	switch (imsg->hdr.type) {
+	case IMSG_VMDOP_START_VM_REQUEST:
+		reply = IMSG_VMDOP_START_VM_RESPONSE;
+		result = start_vm(imsg);
+		break;
+	case IMSG_VMDOP_TERMINATE_VM_REQUEST:
+		reply = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
+		result = terminate_vm(imsg);
+		break;
+	case IMSG_VMDOP_GET_INFO_VM_REQUEST:
+		reply = IMSG_VMDOP_GET_INFO_VM_END_DATA;
+		result = get_info_vm(ps, imsg);
+		break;
+	default:
+		return (-1);
+	}
+
+	/* Nothing to send back */
+	if (reply == 0)
+		return (0);
+
+	if (imsg_compose_event(&ps->ps_ievs[PROC_CONTROL][0], reply,
+	    imsg->hdr.peerid, 0, -1, &result, sizeof(result)) == -1)
+		return (-1);
+
+	return (0);
+}
+
+/*
+ * terminate_vm
+ *
+ * Asks vmm(4) to terminate a VM. The VM is identified by the
+ * vm_terminate_params structure carried in the request, which is handed
+ * to the VMM_IOC_TERM ioctl unchanged.
+ *
+ * Parameters:
+ *  imsg: request whose 'data' field points to a vm_terminate_params struct
+ *
+ * Return values:
+ *  0: success
+ *  !0 : errno from the failed vmm(4) ioctl (eg, ENOENT if the supplied VM
+ *      is not valid)
+ */
+int
+terminate_vm(struct imsg *imsg)
+{
+	struct vm_terminate_params	*params;
+	int				 rc;
+
+	params = (struct vm_terminate_params *)imsg->data;
+	rc = ioctl(env->vmd_fd, VMM_IOC_TERM, params);
+
+	return (rc < 0 ? errno : 0);
+}
+
+/*
+ * opentap
+ *
+ * Probes /dev/tap0 .. /dev/tap(MAX_TAP - 1) in order and opens the first
+ * node that can be opened read/write in non-blocking mode.
+ *
+ * Returns the file descriptor of the tap node opened, or -1 if none of
+ * the tap devices could be opened.
+ */
+int
+opentap(void)
+{
+	char	devname[PATH_MAX];
+	int	unit = 0, tapfd = -1;
+
+	while (unit < MAX_TAP && tapfd == -1) {
+		snprintf(devname, sizeof(devname), "/dev/tap%d", unit++);
+		tapfd = open(devname, O_RDWR | O_NONBLOCK);
+	}
+
+	return (tapfd);
+}
+
+/*
+ * start_vm
+ *
+ * Starts a new VM with the creation parameters supplied (in the incoming
+ * imsg->data field). The disk and nic counts are validated first because
+ * they index fixed-size descriptor arrays in this function. The VM is then
+ * created in the following steps:
+ *
+ * 1. opens the VM disk image files specified in the VM creation parameters
+ * 2. opens the specified VM kernel
+ * 3. creates a VM console tty pair using openpty
+ * 4. opens one tap device per requested nic
+ * 5. forks, passing the file descriptors opened in steps 1-4 to the child
+ *    vmd responsible for dropping privilege and running the VM's VCPU
+ *    loops.
+ *
+ * The child never returns from this function; it exits when run_vm returns.
+ *
+ * Parameters:
+ *  imsg: The incoming imsg body whose 'data' field is a vm_create_params
+ *      struct containing the VM creation parameters.
+ *
+ * Return values:
+ *  0: success
+ *  !0 : failure - typically an errno indicating the source of the failure
+ */
+int
+start_vm(struct imsg *imsg)
+{
+	struct vm_create_params	*vcp;
+	struct stat		 sb;
+	size_t			 i;
+	int			 child_disks[VMM_MAX_DISKS_PER_VM];
+	int			 child_taps[VMM_MAX_NICS_PER_VM];
+	int			 kernel_fd, ttym_fd, ttys_fd, ret;
+	char			 ptyn[32];
+
+	vcp = (struct vm_create_params *)imsg->data;
+
+	/*
+	 * The counts come from the request and are used as array bounds
+	 * below, long before vmm_create_vm gets a chance to reject them.
+	 */
+	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM ||
+	    vcp->vcp_nnics > VMM_MAX_NICS_PER_VM) {
+		log_warnx("%s: too many disks or nics requested", __progname);
+		return (EINVAL);
+	}
+
+	for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++)
+		child_disks[i] = -1;
+	for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++)
+		child_taps[i] = -1;
+
+	kernel_fd = -1;
+	ttym_fd = -1;
+	ttys_fd = -1;
+
+	/* Open disk images for child */
+	for (i = 0 ; i < vcp->vcp_ndisks; i++) {
+		child_disks[i] = open(vcp->vcp_disks[i], O_RDWR);
+		if (child_disks[i] == -1) {
+			ret = errno;
+			log_warn("%s: can't open %s", __progname,
+			    vcp->vcp_disks[i]);
+			goto err;
+		}
+	}
+
+	/* stat first so a missing kernel gets its own diagnostic */
+	bzero(&sb, sizeof(sb));
+	if (stat(vcp->vcp_kernel, &sb) == -1) {
+		ret = errno;
+		log_warn("%s: can't stat kernel image %s",
+		    __progname, vcp->vcp_kernel);
+		goto err;
+	}
+
+	/* Open kernel image */
+	kernel_fd = open(vcp->vcp_kernel, O_RDONLY);
+	if (kernel_fd == -1) {
+		ret = errno;
+		log_warn("%s: can't open kernel image %s",
+		    __progname, vcp->vcp_kernel);
+		goto err;
+	}
+
+	if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) {
+		ret = errno;
+		log_warn("%s: openpty failed", __progname);
+		goto err;
+	}
+
+	/* Only the master side is used by the VM console emulation */
+	if (close(ttys_fd)) {
+		ret = errno;
+		log_warn("%s: close tty failed", __progname);
+		goto err;
+	}
+
+	/* Open tap devices for child */
+	for (i = 0 ; i < vcp->vcp_nnics; i++) {
+		child_taps[i] = opentap();
+		if (child_taps[i] == -1) {
+			ret = errno;
+			log_warn("%s: can't open tap for nic %zu",
+			    __progname, i);
+			goto err;
+		}
+	}
+
+	/* Start child vmd for this VM (fork, chroot, drop privs) */
+	ret = start_client_vmd();
+
+	/* Start child failed? - cleanup and leave */
+	if (ret == -1) {
+		ret = EIO;
+		goto err;
+	}
+
+	if (ret > 0) {
+		/* Parent: the child owns its own copies of the descriptors */
+		for (i = 0 ; i < vcp->vcp_ndisks; i++)
+			close(child_disks[i]);
+
+		for (i = 0 ; i < vcp->vcp_nnics; i++)
+			close(child_taps[i]);
+
+		close(kernel_fd);
+		close(ttym_fd);
+
+		return (0);
+	}
+
+	/* Child */
+
+	/* The VM name comes from the request; never use it as a format */
+	setproctitle("%s", vcp->vcp_name);
+	log_procinit(vcp->vcp_name);
+
+	log_info("%s: vm console: %s", __progname, ptyn);
+	ret = vmm_create_vm(vcp);
+	if (ret) {
+		errno = ret;
+		fatal("create vmm ioctl failed - exiting");
+	}
+
+	/* Load kernel image */
+	ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size);
+	if (ret) {
+		errno = ret;
+		fatal("failed to load kernel - exiting");
+	}
+
+	close(kernel_fd);
+
+	con_fd = ttym_fd;
+	if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1)
+		fatal("failed to set nonblocking mode on console");
+
+	/* Execute the vcpu run loop(s) for this VM */
+	ret = run_vm(child_disks, child_taps, vcp);
+	_exit(ret != 0);
+
+err:
+	for (i = 0 ; i < vcp->vcp_ndisks; i++)
+		if (child_disks[i] != -1)
+			close(child_disks[i]);
+
+	for (i = 0 ; i < vcp->vcp_nnics; i++)
+		if (child_taps[i] != -1)
+			close(child_taps[i]);
+
+	if (kernel_fd != -1)
+		close(kernel_fd);
+
+	if (ttym_fd != -1)
+		close(ttym_fd);
+
+	return (ret);
+}
+
+/*
+ * get_info_vm
+ *
+ * Queries vmm(4) for the list of VMs it knows about and forwards one
+ * IMSG_VMDOP_GET_INFO_VM_DATA message per VM to the control process.
+ *
+ * Parameters:
+ *  ps: privsep state providing the imsg channel to PROC_CONTROL
+ *  imsg: the request being answered; its peerid is copied into every
+ *      reply
+ *
+ * Return values:
+ *  0: success
+ *  !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl)
+ */
+int
+get_info_vm(struct privsep *ps, struct imsg *imsg)
+{
+	struct vm_info_params	 vip;
+	struct vm_info_result	*info;
+	size_t			 ct, i;
+	int			 ret;
+
+	/*
+	 * We issue the VMM_IOC_INFO ioctl twice, once with an input
+	 * buffer size of 0, which results in vmm(4) returning the
+	 * number of bytes required back to us in vip.vip_size,
+	 * and then we call it again after malloc'ing the required
+	 * number of bytes.
+	 *
+	 * It is possible that we could fail a second time (eg, if
+	 * another VM was created in the instant between the two
+	 * ioctls), but in that case the caller can just try again
+	 * as vmm(4) will return a zero-sized list in that case.
+	 */
+
+	/* Zero everything so no stack garbage is handed to the kernel */
+	memset(&vip, 0, sizeof(vip));
+
+	/* First ioctl to see how many bytes needed (vip.vip_size) */
+	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0)
+		return (errno);
+
+	if (vip.vip_info_ct != 0)
+		return (EIO);
+
+	info = malloc(vip.vip_size);
+	if (info == NULL)
+		return (ENOMEM);
+
+	/* Second ioctl to get the actual list */
+	vip.vip_info = info;
+	if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) {
+		ret = errno;
+		free(info);
+		return (ret);
+	}
+
+	/* Return info to the control process, one message per VM */
+	ret = 0;
+	ct = vip.vip_size / sizeof(struct vm_info_result);
+	for (i = 0; i < ct; i++) {
+		if (imsg_compose_event(&ps->ps_ievs[PROC_CONTROL][0],
+		    IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, 0,
+		    -1, &info[i], sizeof(struct vm_info_result)) == -1) {
+			ret = EIO;
+			break;
+		}
+	}
+
+	/* Single exit so the list is released on the error path too */
+	free(info);
+	return (ret);
+}
+
+
+
+/*
+ * start_client_vmd
+ *
+ * Forks the calling vmd. The child chroots to the home directory of
+ * VMD_USER, changes to "/" and switches its groups, gid and uid to those
+ * of VMD_USER before returning. If the fork succeeds but any of those
+ * steps fails, the child exits via fatal().
+ *
+ * Return values (returns to both child and parent on success):
+ *  -1 : failure (unknown VMD_USER or fork failed)
+ *  0: returned in the child vmd
+ *  !0 : returned in the parent vmd, the child's pid
+ */
+int
+start_client_vmd(void)
+{
+	struct passwd	*pw;
+	int		 pid;
+
+	if ((pw = getpwnam(VMD_USER)) == NULL) {
+		log_warnx("%s: no such user %s", __progname, VMD_USER);
+		return (-1);
+	}
+
+	pid = fork();
+	if (pid < 0)
+		return (-1);
+	if (pid > 0) {
+		/* Parent */
+		return (pid);
+	}
+
+	/* Child: confine to the user's home, then drop privileges */
+	if (chroot(pw->pw_dir) != 0)
+		fatal("unable to chroot");
+	if (chdir("/") != 0)
+		fatal("unable to chdir");
+
+	if (setgroups(1, &pw->pw_gid) == -1)
+		fatal("setgroups() failed");
+	if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1)
+		fatal("setresgid() failed");
+	if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1)
+		fatal("setresuid() failed");
+
+	return (0);
+}
+
+/*
+ * vmm_create_vm
+ *
+ * Requests vmm(4) to create a new VM using the supplied creation
+ * parameters. This operation results in the creation of the in-kernel
+ * structures for the VM, but does not start the VM's vcpu(s).
+ *
+ * Parameters:
+ *  vcp: vm_create_params struct containing the VM's desired creation
+ *      configuration
+ *
+ * Return values:
+ *  0: success
+ *  EINVAL: the cpu, memory, disk or nic count exceeds the supported
+ *      maximum
+ *  other !0 : errno from the failed ioctl to vmm(4)
+ */
+int
+vmm_create_vm(struct vm_create_params *vcp)
+{
+	/* Sanity check arguments */
+	if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM)
+		return (EINVAL);
+
+	if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE)
+		return (EINVAL);
+
+	if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM)
+		return (EINVAL);
+
+	/* The tap descriptor array is sized by VMM_MAX_NICS_PER_VM */
+	if (vcp->vcp_nnics > VMM_MAX_NICS_PER_VM)
+		return (EINVAL);
+
+	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0)
+		return (errno);
+
+	return (0);
+}
+
+/*
+ * init_emulated_hw
+ *
+ * Resets the userspace device models before the VM runs: the three i8253
+ * PIT counters (start time "now", start value TIMER_DIV(100)), the com1
+ * UART registers (all zero), the PCI emulation, and the virtio devices.
+ *
+ * Parameters:
+ *  vcp: VM creation parameters, passed on to virtio_init
+ *  child_disks: disk image descriptors, passed on to virtio_init
+ *  child_taps: tap device descriptors, passed on to virtio_init
+ */
+void
+init_emulated_hw(struct vm_create_params *vcp, int *child_disks,
+    int *child_taps)
+{
+	size_t	n;
+
+	/* Init the i8253 PIT's 3 counters */
+	memset(i8253_counter, 0, sizeof(i8253_counter));
+	for (n = 0; n < 3; n++)
+		gettimeofday(&i8253_counter[n].tv, NULL);
+	for (n = 0; n < 3; n++)
+		i8253_counter[n].start = TIMER_DIV(100);
+
+	/* Init ns8250 UART */
+	memset(&com1_regs, 0, sizeof(com1_regs));
+
+	/* Initialize PCI */
+	pci_init();
+
+	/* Initialize virtio devices */
+	virtio_init(vcp, child_disks, child_taps);
+}
+
+/*
+ * run_vm
+ *
+ * Runs the VM whose creation parameters are specified in vcp: initializes
+ * the emulated hardware, starts one thread per VCPU and waits for all of
+ * them to finish.
+ *
+ * Memory allocated here is not released on the error paths because the
+ * only caller exits the process as soon as this function returns.
+ *
+ * Parameters:
+ *  child_disks: previously-opened child VM disk file file descriptors
+ *  child_taps: previously-opened child tap file descriptors
+ *  vcp: vm_create_params struct containing the VM's desired creation
+ *      configuration
+ *
+ * Return values:
+ *  0: the VM exited normally
+ *  !0 : the VM exited abnormally or failed to start
+ */
+int
+run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp)
+{
+	struct vm_run_params	**vrp;
+	pthread_t		 *tid;
+	void			 *exit_status;
+	size_t			  i;
+	int			  ret;
+
+	ret = 0;
+
+	/*
+	 * XXX ensure nvcpus in vcp is same as vm, or fix vmm to return einval
+	 * on bad vcpu id
+	 */
+
+	/* calloc rejects a count * size product that would overflow */
+	tid = calloc(vcp->vcp_ncpus, sizeof(*tid));
+	vrp = calloc(vcp->vcp_ncpus, sizeof(*vrp));
+	if (tid == NULL || vrp == NULL) {
+		log_warn("%s: memory allocation error - exiting.",
+		    __progname);
+		return (ENOMEM);
+	}
+
+	init_emulated_hw(vcp, child_disks, child_taps);
+
+	/*
+	 * Create and launch one thread for each VCPU. These threads may
+	 * migrate between PCPUs over time; the need to reload CPU state
+	 * in such situations is detected and performed by vmm(4) in the
+	 * kernel.
+	 */
+	for (i = 0 ; i < vcp->vcp_ncpus; i++) {
+		/* Zeroed so no uninitialized field reaches the run ioctl */
+		vrp[i] = calloc(1, sizeof(struct vm_run_params));
+		if (vrp[i] == NULL) {
+			log_warn("%s: memory allocation error - "
+			    "exiting.", __progname);
+			/* caller will exit, so skip free'ing */
+			return (ENOMEM);
+		}
+		vrp[i]->vrp_exit = calloc(1, sizeof(union vm_exit));
+		if (vrp[i]->vrp_exit == NULL) {
+			log_warn("%s: memory allocation error - "
+			    "exiting.", __progname);
+			/* caller will exit, so skip free'ing */
+			return (ENOMEM);
+		}
+		vrp[i]->vrp_vm_id = vcp->vcp_id;
+		vrp[i]->vrp_vcpu_id = i;
+
+		/* Start each VCPU run thread at vcpu_run_loop */
+		ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]);
+		if (ret) {
+			/* caller will _exit after this return */
+			return (ret);
+		}
+	}
+
+	/* Wait for all the threads to exit */
+	for (i = 0; i < vcp->vcp_ncpus; i++) {
+		if (pthread_join(tid[i], &exit_status)) {
+			log_warn("%s: failed to join thread %zu - "
+			    "exiting", __progname, i);
+			return (EIO);
+		}
+
+		/* vcpu_run_loop returns NULL only on a clean shutdown */
+		if (exit_status != NULL) {
+			log_warnx("%s: vm %d vcpu run thread %zu exited "
+			    "abnormally", __progname, vcp->vcp_id, i);
+			ret = EIO;
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * vcpu_run_loop
+ *
+ * Thread body for one VCPU. Repeatedly enters the guest through the
+ * VMM_IOC_RUN ioctl and, whenever vmm(4) reports an exit it cannot handle
+ * itself, passes it to vcpu_exit before re-entering.
+ *
+ * Parameters:
+ *  arg: vm_run_params for the VCPU being run by this thread
+ *
+ * Return values:
+ *  NULL: the VCPU shutdown properly
+ *  !NULL: error processing VCPU run, or the VCPU shutdown abnormally
+ *      (the errno of the failed ioctl, or EIO, cast to a pointer)
+ */
+void *
+vcpu_run_loop(void *arg)
+{
+	struct vm_run_params	*vrp = arg;
+
+	vrp->vrp_continue = 0;
+	vrp->vrp_injint = -1;
+
+	while (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) >= 0) {
+		/* If the VM is terminating, exit normally */
+		if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED)
+			return (NULL);
+
+		/* Nothing for us to do, just re-enter the guest */
+		if (vrp->vrp_exit_reason == VM_EXIT_NONE)
+			continue;
+
+		/* vmm(4) needs help handling an exit */
+		if (vcpu_exit(vrp) != 0)
+			return ((void *)(intptr_t)EIO);
+	}
+
+	/* The run ioctl failed, report why */
+	return ((void *)(intptr_t)errno);
+}
+
+/*
+ * vcpu_exit_i8253
+ *
+ * Handles emulated i8253 PIT access (in/out instruction to PIT ports).
+ * We don't emulate all the modes of the i8253, just the basic squarewave
+ * clock.
+ *
+ * Counter values are transferred one byte per access, high byte first,
+ * for both reads and writes; last_r and last_w remember which byte comes
+ * next in each direction.
+ * NOTE(review): the 8253 "LSB then MSB" access mode transfers the low byte
+ * first - confirm the byte order used here against what guests expect.
+ *
+ * Parameters:
+ *  vei: VM exit information from vmm(4) containing information on the in/out
+ *      instruction being performed
+ */
+void
+vcpu_exit_i8253(union vm_exit *vei)
+{
+	uint32_t out_data, start;
+	uint8_t sel, rw, data;
+	uint64_t ns, ticks;
+	struct timeval now, delta;
+
+	if (vei->vei.vei_port == TIMER_CTRL) {
+		if (vei->vei.vei_dir != 0) {
+			/* XXX should this return 0xff? */
+			log_warnx("%s: i8253 PIT: read from control "
+			    "port unsupported", __progname);
+			return;
+		}
+
+		/* OUT instruction */
+		out_data = vei->vei.vei_data;
+		sel = out_data &
+		    (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2);
+		sel = sel >> 6;
+		if (sel > 2) {
+			log_warnx("%s: i8253 PIT: invalid "
+			    "timer selected (%d)",
+			    __progname, sel);
+			return;
+		}
+
+		rw = vei->vei.vei_data &
+		    (TIMER_LATCH | TIMER_LSB |
+		    TIMER_MSB | TIMER_16BIT);
+
+		if (rw == TIMER_16BIT) {
+			/*
+			 * XXX this seems to be used on occasion, needs
+			 * to be implemented
+			 */
+			log_warnx("%s: i8253 PIT: 16 bit "
+			    "counter I/O not supported",
+			    __progname);
+			return;
+		}
+
+		if (rw != TIMER_LATCH) {
+			log_warnx("%s: i8253 PIT: unsupported rw mode "
+			    "%d", __progname, rw);
+			return;
+		}
+
+		/*
+		 * Since we don't truly emulate each tick of the PIT
+		 * clock, when the guest asks for the timer to be
+		 * latched, simulate what the counter would have been
+		 * had we performed full emulation. We do this by
+		 * calculating when the counter was reset vs how much
+		 * time has elapsed, then bias by the counter tick
+		 * rate.
+		 */
+		gettimeofday(&now, NULL);
+		delta.tv_sec = now.tv_sec - i8253_counter[sel].tv.tv_sec;
+		delta.tv_usec = now.tv_usec - i8253_counter[sel].tv.tv_usec;
+		if (delta.tv_usec < 0) {
+			delta.tv_sec--;
+			delta.tv_usec += 1000000;
+		}
+		ns = delta.tv_usec * 1000 + delta.tv_sec * 1000000000;
+		ticks = ns / NS_PER_TICK;
+
+		/*
+		 * The guest may program a count of 0, which the i8253
+		 * treats as 65536. Using it as-is would divide by zero.
+		 */
+		start = i8253_counter[sel].start;
+		if (start == 0)
+			start = 0x10000;
+		i8253_counter[sel].olatch = start - ticks % start;
+		return;
+	}
+
+	/* Access to one of the three counter data ports */
+	sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE);
+	if (vei->vei.vei_dir == 0) { /* OUT instruction */
+		/* The counter ports are 8 bits wide */
+		out_data = vei->vei.vei_data & 0xFF;
+		if (i8253_counter[sel].last_w == 0) {
+			/*
+			 * First byte of a new value: assign rather than
+			 * OR so bits of the previous value don't leak in.
+			 */
+			i8253_counter[sel].ilatch = out_data << 8;
+			i8253_counter[sel].last_w = 1;
+		} else {
+			i8253_counter[sel].ilatch |= out_data;
+			i8253_counter[sel].start =
+			    i8253_counter[sel].ilatch;
+			i8253_counter[sel].last_w = 0;
+		}
+	} else {
+		/*
+		 * Track the read position in last_r; touching last_w
+		 * here would corrupt a write sequence in progress and
+		 * make every read return the high byte.
+		 */
+		if (i8253_counter[sel].last_r == 0) {
+			data = i8253_counter[sel].olatch >> 8;
+			vei->vei.vei_data = data;
+			i8253_counter[sel].last_r = 1;
+		} else {
+			data = i8253_counter[sel].olatch & 0xFF;
+			vei->vei.vei_data = data;
+			i8253_counter[sel].last_r = 0;
+		}
+	}
+}
+
+/*
+ * vcpu_process_com_data
+ *
+ * Emulate in/out instructions to the com1 (ns8250) UART data register
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) containing information on the in/out
+ *      instruction being performed
+ */
+void
+vcpu_process_com_data(union vm_exit *vei)
+{
+	if (vei->vei.vei_dir == 0) {
+		/*
+		 * out instruction: we emulate a chip without a fifo, so
+		 * the character goes straight to the console pty. If the
+		 * guest enabled the TXRDY interrupt (IER bit 0x2), report
+		 * it in IIR and mark an interrupt as pending by clearing
+		 * the IIR low bit.
+		 */
+		write(con_fd, &vei->vei.vei_data, 1);
+		if (com1_regs.ier & 0x2)
+			com1_regs.iir = (com1_regs.iir | IIR_TXRDY) & ~0x1;
+		return;
+	}
+
+	/*
+	 * in instruction: the guest always gets the current content of
+	 * the data register. When a character was pending (RXRDY) it is
+	 * consumed; otherwise the read is only logged.
+	 */
+	/* XXX should a read without RXRDY return com1_regs.data or 0xff? */
+	vei->vei.vei_data = com1_regs.data;
+	if (com1_regs.lsr & LSR_RXRDY) {
+		com1_regs.data = 0x0;
+		com1_regs.lsr &= ~LSR_RXRDY;
+	} else
+		log_warnx("guest reading com1 when not ready");
+
+	/* Reading the data register always clears RXRDY from IIR */
+	com1_regs.iir &= ~IIR_RXRDY;
+
+	/* Nothing left pending: set the IIR low bit ("no interrupt") */
+	if (com1_regs.iir == 0x0)
+		com1_regs.iir = 0x1;
+}
+
+/*
+ * vcpu_process_com_lcr
+ *
+ * Services a guest in/out instruction aimed at the line control register
+ * of the emulated com1 (ns8250) UART.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) describing the in/out instruction
+ *      being performed
+ */
+void
+vcpu_process_com_lcr(union vm_exit *vei)
+{
+	/* in instruction: report the stored line control value */
+	if (vei->vei.vei_dir != 0) {
+		vei->vei.vei_data = com1_regs.lcr;
+		return;
+	}
+
+	/* out instruction: latch the low 8 bits written by the guest */
+	com1_regs.lcr = (uint8_t)vei->vei.vei_data;
+}
+
+/*
+ * vcpu_process_com_iir
+ *
+ * Services a guest in/out instruction aimed at the interrupt information
+ * register of the emulated com1 (ns8250) UART. On real hardware a write to
+ * this port lands in the FCR (FIFO control register); no fifo is emulated
+ * here, but the written value is still stored.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) describing the in/out instruction
+ *      being performed
+ */
+void
+vcpu_process_com_iir(union vm_exit *vei)
+{
+	/* out instruction: the write goes to the FCR */
+	if (vei->vei.vei_dir == 0) {
+		com1_regs.fcr = vei->vei.vei_data;
+		return;
+	}
+
+	/*
+	 * in instruction: return the current IIR, then drop TXRDY, since
+	 * reading the IIR resets that bit.
+	 */
+	vei->vei.vei_data = com1_regs.iir;
+	com1_regs.iir &= ~IIR_TXRDY;
+
+	/* Nothing left pending: set IIR low bit ("no interrupt pending") */
+	if (!com1_regs.iir)
+		com1_regs.iir = 0x1;
+}
+
+/*
+ * vcpu_process_com_mcr
+ *
+ * Services a guest in/out instruction aimed at the modem control register
+ * of the emulated com1 (ns8250) UART. The register is a plain latch: writes
+ * are stored and reads return the stored value.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) describing the in/out instruction
+ *      being performed
+ */
+void
+vcpu_process_com_mcr(union vm_exit *vei)
+{
+	/* in instruction: read back the MCR */
+	if (vei->vei.vei_dir != 0) {
+		vei->vei.vei_data = com1_regs.mcr;
+		return;
+	}
+
+	/* out instruction: store the new MCR value */
+	com1_regs.mcr = vei->vei.vei_data;
+}
+
+/*
+ * vcpu_process_com_lsr
+ *
+ * Services a guest in/out instruction aimed at the line status register of
+ * the emulated com1 (ns8250) UART.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) describing the in/out instruction
+ *      being performed
+ */
+void
+vcpu_process_com_lsr(union vm_exit *vei)
+{
+	/*
+	 * in instruction: output characters are processed immediately, so
+	 * TXRDY and TSRE are always reported on top of the stored LSR bits.
+	 */
+	if (vei->vei.vei_dir != 0) {
+		vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY;
+		return;
+	}
+
+	/* out instruction: writing the LSR is illegal; log it and move on */
+	log_warnx("%s: LSR UART write 0x%x unsupported",
+	    __progname, vei->vei.vei_data);
+}
+
+/*
+ * vcpu_process_com_msr
+ *
+ * Emulate in/out instructions to the com1 (ns8250) UART modem status register.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) containing information on the in/out
+ *      instruction being performed
+ */
+void
+vcpu_process_com_msr(union vm_exit *vei)
+{
+	/*
+	 * vei_dir == 0 : out instruction
+	 *
+	 * Write to MSR. This is an illegal operation, so we just log it and
+	 * continue.
+	 */
+	if (vei->vei.vei_dir == 0) {
+		log_warnx("%s: MSR UART write 0x%x unsupported",
+		    __progname, vei->vei.vei_data);
+	} else {
+		/*
+		 * vei_dir == 1 : in instruction
+		 *
+		 * Read from MSR. We always report DCD, DSR, and CTS.
+		 *
+		 * Only those three bits are returned. Line status (LSR)
+		 * bits must not be mixed in here: they occupy the same bit
+		 * positions as unrelated modem status flags and would be
+		 * misread by the guest. Writes to the MSR are rejected
+		 * above, so there is no other modem state to report.
+		 */
+		vei->vei.vei_data = MSR_DCD | MSR_DSR | MSR_CTS;
+	}
+}
+
+/*
+ * vcpu_process_com_scr
+ *
+ * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The
+ * scratch register is sometimes used to distinguish an 8250 from a 16450,
+ * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We
+ * simulate an "original" 8250 by forcing the scratch register to return data
+ * on read that is different from what was written.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) containing information on the in/out
+ *      instruction being performed
+ */
+void
+vcpu_process_com_scr(union vm_exit *vei)
+{
+	/*
+	 * vei_dir == 0 : out instruction
+	 *
+	 * Write to SCR
+	 */
+	if (vei->vei.vei_dir == 0) {
+		com1_regs.scr = vei->vei.vei_data;
+	} else {
+		/*
+		 * vei_dir == 1 : in instruction
+		 *
+		 * Read from SCR. To make sure we don't accidentally simulate
+		 * a real scratch register, we negate what was written on
+		 * subsequent readback.
+		 *
+		 * The complement is truncated to 8 bits: '~' operates on the
+		 * integer-promoted value, so without the cast the bits above
+		 * the low byte of vei_data would all be set for what is an
+		 * 8-bit register.
+		 */
+		vei->vei.vei_data = (uint8_t)~com1_regs.scr;
+	}
+}
+
+/*
+ * vcpu_process_com_ier
+ *
+ * Services a guest in/out instruction aimed at the interrupt enable
+ * register of the emulated com1 (ns8250) UART. The register is a plain
+ * latch: writes are stored and reads return the stored value.
+ *
+ * Parameters:
+ *  vei: vm exit information from vmm(4) describing the in/out instruction
+ *      being performed
+ */
+void
+vcpu_process_com_ier(union vm_exit *vei)
+{
+	/* in instruction: read back the IER */
+	if (vei->vei.vei_dir != 0) {
+		vei->vei.vei_data = com1_regs.ier;
+		return;
+	}
+
+	/* out instruction: store the new IER value */
+	com1_regs.ier = vei->vei.vei_data;
+}
+
+/*
+ * vcpu_exit_com
+ *
+ * Dispatches an I/O exit on one of the com1 (ns8250) UART ports to the
+ * handler for the register that was accessed. Most basic 8250 features are
+ * handled, with the exception of the divisor latch (eg, no baud rate
+ * support). Ports without a handler are silently ignored.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ */
+void
+vcpu_exit_com(struct vm_run_params *vrp)
+{
+	union vm_exit *exit_info = vrp->vrp_exit;
+
+	switch (exit_info->vei.vei_port) {
+	case COM1_DATA:
+		vcpu_process_com_data(exit_info);
+		break;
+	case COM1_IER:
+		vcpu_process_com_ier(exit_info);
+		break;
+	case COM1_IIR:
+		vcpu_process_com_iir(exit_info);
+		break;
+	case COM1_LCR:
+		vcpu_process_com_lcr(exit_info);
+		break;
+	case COM1_MCR:
+		vcpu_process_com_mcr(exit_info);
+		break;
+	case COM1_LSR:
+		vcpu_process_com_lsr(exit_info);
+		break;
+	case COM1_MSR:
+		vcpu_process_com_msr(exit_info);
+		break;
+	case COM1_SCR:
+		vcpu_process_com_scr(exit_info);
+		break;
+	default:
+		/* no handler for this port */
+		break;
+	}
+}
+
+/*
+ * vcpu_exit_pci
+ *
+ * Routes an I/O exit on a PCI port to the matching handler of the emulated
+ * PCI subsystem.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ *  0xff if no interrupt is required after this pci exit,
+ *  or an interrupt vector otherwise
+ */
+uint8_t
+vcpu_exit_pci(struct vm_run_params *vrp)
+{
+	union vm_exit *exit_info = vrp->vrp_exit;
+	uint8_t vector = 0xFF;	/* "no interrupt" unless pci_handle_io says so */
+
+	switch (exit_info->vei.vei_port) {
+	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+		/* only device I/O BAR accesses can produce an interrupt */
+		vector = pci_handle_io(vrp);
+		break;
+	case PCI_MODE1_DATA_REG:
+		pci_handle_data_reg(vrp);
+		break;
+	case PCI_MODE1_ADDRESS_REG:
+		pci_handle_address_reg(vrp);
+		break;
+	default:
+		log_warnx("%s: unknown PCI register 0x%llx",
+		    __progname, (uint64_t)exit_info->vei.vei_port);
+		break;
+	}
+
+	return (vector);
+}
+
+/*
+ * vcpu_exit_inout
+ *
+ * Top-level dispatcher for I/O port exits that vmd emulates: the i8253
+ * PIT, the com1 ns8250 UART and the PCI ports. An IN from any other port
+ * returns all ones; an OUT to any other port is ignored.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ */
+void
+vcpu_exit_inout(struct vm_run_params *vrp)
+{
+	union vm_exit *exit_info = vrp->vrp_exit;
+	uint8_t vector;
+
+	switch (exit_info->vei.vei_port) {
+	case PCI_MODE1_ADDRESS_REG:
+	case PCI_MODE1_DATA_REG:
+	case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
+		/* 0xFF from the PCI layer means "nothing to inject" */
+		vector = vcpu_exit_pci(vrp);
+		vrp->vrp_injint = (vector == 0xFF) ? -1 : vector;
+		break;
+	case COM1_DATA ... COM1_SCR:
+		vcpu_exit_com(vrp);
+		break;
+	case TIMER_CTRL:
+	case (TIMER_CNTR0 + TIMER_BASE):
+	case (TIMER_CNTR1 + TIMER_BASE):
+	case (TIMER_CNTR2 + TIMER_BASE):
+		vcpu_exit_i8253(exit_info);
+		break;
+	default:
+		/* IN from unsupported port gives FFs */
+		if (exit_info->vei.vei_dir == 1)
+			exit_info->vei.vei_data = 0xFFFFFFFF;
+		break;
+	}
+}
+
+/*
+ * vcpu_exit
+ *
+ * Entry point for vcpu exits that vmm(4) cannot complete without help from
+ * vmd (eg, accessing an I/O port or device). Guest state arrives in 'vrp'
+ * and is handed back to vmm(4) once the exit has been processed.
+ *
+ * After the exit itself is handled, pending device activity is polled
+ * (vionet receive, com1 input) and vrp->vrp_injint may be set to the IRQ
+ * line whose interrupt should be vectored into the guest.
+ *
+ * Parameters:
+ *  vrp: vcpu run parameters containing guest state for this exit
+ *
+ * Return values:
+ *  0: the exit was handled successfully
+ *  1: an error occurred (exit not handled)
+ */
+int
+vcpu_exit(struct vm_run_params *vrp)
+{
+	ssize_t nread;
+	char input;
+
+	if (vrp->vrp_exit_reason == VMX_EXIT_IO) {
+		vcpu_exit_inout(vrp);
+	} else if (vrp->vrp_exit_reason != VMX_EXIT_HLT) {
+		log_warnx("%s: unknown exit reason %d",
+		    __progname, vrp->vrp_exit_reason);
+		return (1);
+	}
+	/*
+	 * XXX VMX_EXIT_HLT: handle halted state, no reason to run this vcpu
+	 * again until a vm interrupt is to be injected
+	 */
+
+	/* XXX interrupt priority */
+	if (vionet_process_rx())
+		vrp->vrp_injint = 9;
+
+	/*
+	 * Poll the console for a new com1 character. When one is available
+	 * it is buffered in the com1 data register, RXRDY is set in the line
+	 * status register and, if the guest enabled receive interrupts, the
+	 * IIR is updated so that IRQ4 gets asserted below.
+	 *
+	 * XXX - move all this com intr checking to another function
+	 */
+	nread = read(con_fd, &input, sizeof(input));
+	if (nread == 1) {
+		com1_regs.data = input;
+		com1_regs.lsr |= LSR_RXRDY;
+		/* XXX these ier and iir bits should be IER_x and IIR_x */
+		if (com1_regs.ier & 0x1)
+			com1_regs.iir = (com1_regs.iir | (2 << 1)) & ~0x1;
+	}
+
+	/*
+	 * No interrupt source bits left: set the IIR low bit, which means
+	 * "no interrupt pending"
+	 */
+	/* XXX these iir magic numbers should be IIR_x */
+	if ((com1_regs.iir & ~0x1) == 0x0)
+		com1_regs.iir = 0x1;
+
+	/* com1 interrupt pending and no other injection queued: inject */
+	if ((com1_regs.iir & 0x1) == 0 && vrp->vrp_injint == -1)
+		vrp->vrp_injint = 0x4;
+
+	vrp->vrp_continue = 1;
+
+	return (0);
+}
+
+/*
+ * write_page
+ *
+ * Pushes a page of data from 'buf' into the guest VM's memory
+ * at paddr 'dst'.
+ *
+ * Parameters:
+ *  dst: the destination paddr_t in the guest VM to push into.
+ *      If there is no guest paddr mapping at 'dst', a new page will be
+ *      faulted in by the VMM (provided 'dst' represents a valid paddr
+ *      in the guest's address space)
+ *  buf: page of data to push
+ *  len: size of 'buf'
+ *  do_mask: 1 to mask the destination address (for kernel load), 0 to
+ *      leave 'dst' unmasked. Masking keeps the low 28 bits of 'dst'.
+ *
+ * Return values:
+ *  the errno value set by a failed ioctl(VMM_IOC_WRITEPAGE), or 0 if no
+ *  error occurred.
+ *
+ * Note - this function only handles GPAs < 4GB.
+ */
+int
+write_page(uint32_t dst, void *buf, uint32_t len, int do_mask)
+{
+	struct vm_writepage_params vwp;
+	int ret;
+
+	/*
+	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
+	 * errors
+	 */
+	if (do_mask)
+		dst &= 0xFFFFFFF;
+
+	vwp.vwp_paddr = (paddr_t)dst;
+	vwp.vwp_data = buf;
+	vwp.vwp_vm_id = vm_id;
+	vwp.vwp_len = len;
+	if (ioctl(env->vmd_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) {
+		/*
+		 * Capture errno before logging: the logging call may itself
+		 * modify errno, and the caller must see the ioctl's error.
+		 */
+		ret = errno;
+		log_warn("writepage ioctl failed");
+		return (ret);
+	}
+	return (0);
+}
+
+/*
+ * read_page
+ *
+ * Reads a page of memory at guest paddr 'src' into 'buf'.
+ *
+ * Parameters:
+ *  src: the source paddr_t in the guest VM to read from.
+ *  buf: destination (local) buffer
+ *  len: size of 'buf'
+ *  do_mask: 1 to mask the source address (for kernel load), 0 to
+ *      leave 'src' unmasked. Masking keeps the low 28 bits of 'src'.
+ *
+ * Return values:
+ *  the errno value set by a failed ioctl(VMM_IOC_READPAGE), or 0 if no
+ *  error occurred.
+ *
+ * Note - this function only handles GPAs < 4GB.
+ */
+int
+read_page(uint32_t src, void *buf, uint32_t len, int do_mask)
+{
+	struct vm_readpage_params vrp;
+	int ret;
+
+	/*
+	 * Mask kernel load addresses to avoid uint32_t -> uint64_t cast
+	 * errors
+	 */
+	if (do_mask)
+		src &= 0xFFFFFFF;
+
+	vrp.vrp_paddr = (paddr_t)src;
+	vrp.vrp_data = buf;
+	vrp.vrp_vm_id = vm_id;
+	vrp.vrp_len = len;
+	if (ioctl(env->vmd_fd, VMM_IOC_READPAGE, &vrp) < 0) {
+		/*
+		 * Capture errno before logging: the logging call may itself
+		 * modify errno, and the caller must see the ioctl's error.
+		 */
+		ret = errno;
+		log_warn("readpage ioctl failed");
+		return (ret);
+	}
+	return (0);
+}