diff options
author | reyk <reyk@openbsd.org> | 2015-12-02 09:14:25 +0000 |
---|---|---|
committer | reyk <reyk@openbsd.org> | 2015-12-02 09:14:25 +0000 |
commit | af96af6c62bd200f3658b7d5ec868ef46d2cfd01 (patch) | |
tree | 7a81d339404ef73de70b5b7bce6540d2f48a07c6 /usr.sbin/vmd | |
parent | whitespaces (diff) | |
download | wireguard-openbsd-af96af6c62bd200f3658b7d5ec868ef46d2cfd01.tar.xz wireguard-openbsd-af96af6c62bd200f3658b7d5ec868ef46d2cfd01.zip |
Start tweaking vmd's privsep and daemon model by splitting the main
process into multiple parts and adopting the "proc.c"-style from other
daemons. This makes it possible to further reduce the privileges, to make
better use of pledge(2), and to add some upcoming changes.
"please do" mlarkin@, deraadt@
Diffstat (limited to 'usr.sbin/vmd')
-rw-r--r-- | usr.sbin/vmd/Makefile | 7 | ||||
-rw-r--r-- | usr.sbin/vmd/control.c | 367 | ||||
-rw-r--r-- | usr.sbin/vmd/proc.c | 632 | ||||
-rw-r--r-- | usr.sbin/vmd/proc.h | 187 | ||||
-rw-r--r-- | usr.sbin/vmd/vmd.c | 1681 | ||||
-rw-r--r-- | usr.sbin/vmd/vmd.h | 50 | ||||
-rw-r--r-- | usr.sbin/vmd/vmm.c | 1408 |
7 files changed, 2764 insertions, 1568 deletions
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile index 14518248692..51eba5700cc 100644 --- a/usr.sbin/vmd/Makefile +++ b/usr.sbin/vmd/Makefile @@ -2,14 +2,15 @@ .if ${MACHINE} == "amd64" PROG= vmd -SRCS= vmd.c loadfile_elf.c pci.c virtio.c log.c +SRCS= vmm.c loadfile_elf.c pci.c virtio.c +SRCS+= vmd.c control.c log.c proc.c CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes CFLAGS+= -Wmissing-declarations CFLAGS+= -Wshadow -Wpointer-arith -Wcast-qual CFLAGS+= -Wsign-compare -LDADD+= -lutil -lpthread -DPADD+= ${LIBUTIL} +LDADD+= -lutil -lpthread -levent +DPADD+= ${LIBUTIL} ${LIBEVENT} .else diff --git a/usr.sbin/vmd/control.c b/usr.sbin/vmd/control.c new file mode 100644 index 00000000000..e9c23e98255 --- /dev/null +++ b/usr.sbin/vmd/control.c @@ -0,0 +1,367 @@ +/* $OpenBSD: control.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */ + +/* + * Copyright (c) 2010-2015 Reyk Floeter <reyk@openbsd.org> + * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/tree.h> + +#include <net/if.h> + +#include <errno.h> +#include <event.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> + +#include "proc.h" +#include "vmd.h" + +#define CONTROL_BACKLOG 5 + +struct ctl_connlist ctl_conns; + +void + control_accept(int, short, void *); +struct ctl_conn + *control_connbyfd(int); +void control_close(int, struct control_sock *); +void control_dispatch_imsg(int, short, void *); +int control_dispatch_vmm(int, struct privsep_proc *, struct imsg *); +void control_imsg_forward(struct imsg *); +void control_run(struct privsep *, struct privsep_proc *, void *); + +static struct privsep_proc procs[] = { + { "parent", PROC_PARENT, control_dispatch_vmm } +}; + +pid_t +control(struct privsep *ps, struct privsep_proc *p) +{ + return (proc_run(ps, p, procs, nitems(procs), control_run, NULL)); +} + +void +control_run(struct privsep *ps, struct privsep_proc *p, void *arg) +{ + /* + * pledge in the control process: + * stdio - for malloc and basic I/O including events. + * unix - for the control socket. 
+ */ + if (pledge("stdio unix", NULL) == -1) + fatal("pledge"); +} + +int +control_dispatch_vmm(int fd, struct privsep_proc *p, struct imsg *imsg) +{ + struct ctl_conn *c; + + if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) { + log_warnx("%s: fd %d: not found", __func__, imsg->hdr.peerid); + return (-1); + } + + switch (imsg->hdr.type) { + case IMSG_VMDOP_START_VM_RESPONSE: + case IMSG_VMDOP_TERMINATE_VM_RESPONSE: + case IMSG_VMDOP_GET_INFO_VM_DATA: + case IMSG_VMDOP_GET_INFO_VM_END_DATA: + imsg_compose_event(&c->iev, imsg->hdr.type, + 0, 0, -1, imsg->data, IMSG_DATA_SIZE(imsg)); + break; + default: + return (-1); + } + + return (0); +} + +int +control_init(struct privsep *ps, struct control_sock *cs) +{ + struct sockaddr_un sun; + int fd; + mode_t old_umask, mode; + + if (cs->cs_name == NULL) + return (0); + + if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0)) == -1) { + log_warn("%s: socket", __func__); + return (-1); + } + + sun.sun_family = AF_UNIX; + if (strlcpy(sun.sun_path, cs->cs_name, + sizeof(sun.sun_path)) >= sizeof(sun.sun_path)) { + log_warn("%s: %s name too long", __func__, cs->cs_name); + close(fd); + return (-1); + } + + if (unlink(cs->cs_name) == -1) + if (errno != ENOENT) { + log_warn("%s: unlink %s", __func__, cs->cs_name); + close(fd); + return (-1); + } + + if (cs->cs_restricted) { + old_umask = umask(S_IXUSR|S_IXGRP|S_IXOTH); + mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH; + } else { + old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH); + mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP; + } + + if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) { + log_warn("%s: bind: %s", __func__, cs->cs_name); + close(fd); + (void)umask(old_umask); + return (-1); + } + (void)umask(old_umask); + + if (chmod(cs->cs_name, mode) == -1) { + log_warn("%s: chmod", __func__); + close(fd); + (void)unlink(cs->cs_name); + return (-1); + } + + cs->cs_fd = fd; + cs->cs_env = ps; + + return (0); +} + +int +control_listen(struct control_sock 
*cs) +{ + if (cs->cs_name == NULL) + return (0); + + if (listen(cs->cs_fd, CONTROL_BACKLOG) == -1) { + log_warn("%s: listen", __func__); + return (-1); + } + + event_set(&cs->cs_ev, cs->cs_fd, EV_READ, + control_accept, cs); + event_add(&cs->cs_ev, NULL); + evtimer_set(&cs->cs_evt, control_accept, cs); + + return (0); +} + +void +control_cleanup(struct control_sock *cs) +{ + if (cs->cs_name == NULL) + return; + event_del(&cs->cs_ev); + event_del(&cs->cs_evt); +} + +/* ARGSUSED */ +void +control_accept(int listenfd, short event, void *arg) +{ + struct control_sock *cs = arg; + int connfd; + socklen_t len; + struct sockaddr_un sun; + struct ctl_conn *c; + + event_add(&cs->cs_ev, NULL); + if ((event & EV_TIMEOUT)) + return; + + len = sizeof(sun); + if ((connfd = accept4(listenfd, + (struct sockaddr *)&sun, &len, SOCK_NONBLOCK)) == -1) { + /* + * Pause accept if we are out of file descriptors, or + * libevent will haunt us here too. + */ + if (errno == ENFILE || errno == EMFILE) { + struct timeval evtpause = { 1, 0 }; + + event_del(&cs->cs_ev); + evtimer_add(&cs->cs_evt, &evtpause); + } else if (errno != EWOULDBLOCK && errno != EINTR && + errno != ECONNABORTED) + log_warn("%s: accept", __func__); + return; + } + + if ((c = calloc(1, sizeof(struct ctl_conn))) == NULL) { + log_warn("%s", __func__); + close(connfd); + return; + } + + imsg_init(&c->iev.ibuf, connfd); + c->iev.handler = control_dispatch_imsg; + c->iev.events = EV_READ; + c->iev.data = cs; + event_set(&c->iev.ev, c->iev.ibuf.fd, c->iev.events, + c->iev.handler, c->iev.data); + event_add(&c->iev.ev, NULL); + + TAILQ_INSERT_TAIL(&ctl_conns, c, entry); +} + +struct ctl_conn * +control_connbyfd(int fd) +{ + struct ctl_conn *c; + + for (c = TAILQ_FIRST(&ctl_conns); c != NULL && c->iev.ibuf.fd != fd; + c = TAILQ_NEXT(c, entry)) + ; /* nothing */ + + return (c); +} + +void +control_close(int fd, struct control_sock *cs) +{ + struct ctl_conn *c; + + if ((c = control_connbyfd(fd)) == NULL) { + log_warn("%s: fd %d: 
not found", __func__, fd); + return; + } + + msgbuf_clear(&c->iev.ibuf.w); + TAILQ_REMOVE(&ctl_conns, c, entry); + + event_del(&c->iev.ev); + close(c->iev.ibuf.fd); + + /* Some file descriptors are available again. */ + if (evtimer_pending(&cs->cs_evt, NULL)) { + evtimer_del(&cs->cs_evt); + event_add(&cs->cs_ev, NULL); + } + + free(c); +} + +/* ARGSUSED */ +void +control_dispatch_imsg(int fd, short event, void *arg) +{ + struct control_sock *cs = arg; + struct privsep *ps = cs->cs_env; + struct ctl_conn *c; + struct imsg imsg; + int n, v; + + if ((c = control_connbyfd(fd)) == NULL) { + log_warn("%s: fd %d: not found", __func__, fd); + return; + } + + if (event & EV_READ) { + if ((n = imsg_read(&c->iev.ibuf)) == -1 || n == 0) { + control_close(fd, cs); + return; + } + } + if (event & EV_WRITE) { + if (msgbuf_write(&c->iev.ibuf.w) <= 0 && errno != EAGAIN) { + control_close(fd, cs); + return; + } + } + + for (;;) { + if ((n = imsg_get(&c->iev.ibuf, &imsg)) == -1) { + control_close(fd, cs); + return; + } + + if (n == 0) + break; + + control_imsg_forward(&imsg); + + switch (imsg.hdr.type) { + case IMSG_CTL_NOTIFY: + if (c->flags & CTL_CONN_NOTIFY) { + log_debug("%s: " + "client requested notify more than once", + __func__); + imsg_compose_event(&c->iev, IMSG_CTL_FAIL, + 0, 0, -1, NULL, 0); + break; + } + c->flags |= CTL_CONN_NOTIFY; + break; + case IMSG_CTL_VERBOSE: + IMSG_SIZE_CHECK(&imsg, &v); + + memcpy(&v, imsg.data, sizeof(v)); + log_verbose(v); + + proc_forward_imsg(ps, &imsg, PROC_PARENT, -1); + break; + case IMSG_VMDOP_START_VM_REQUEST: + case IMSG_VMDOP_TERMINATE_VM_REQUEST: + case IMSG_VMDOP_GET_INFO_VM_REQUEST: + imsg.hdr.peerid = fd; + + if (imsg_compose_event(&ps->ps_ievs[PROC_PARENT][0], + imsg.hdr.type, imsg.hdr.peerid, 0, -1, + imsg.data, IMSG_DATA_SIZE(&imsg)) == -1) { + control_close(fd, cs); + return; + } + break; + default: + log_debug("%s: error handling imsg %d", + __func__, imsg.hdr.type); + control_close(fd, cs); + break; + } + imsg_free(&imsg); 
+ } + + imsg_event_add(&c->iev); +} + +void +control_imsg_forward(struct imsg *imsg) +{ + struct ctl_conn *c; + + TAILQ_FOREACH(c, &ctl_conns, entry) + if (c->flags & CTL_CONN_NOTIFY) + imsg_compose_event(&c->iev, imsg->hdr.type, + imsg->hdr.peerid, imsg->hdr.pid, -1, imsg->data, + imsg->hdr.len - IMSG_HEADER_SIZE); +} diff --git a/usr.sbin/vmd/proc.c b/usr.sbin/vmd/proc.c new file mode 100644 index 00000000000..56f8d720d8a --- /dev/null +++ b/usr.sbin/vmd/proc.c @@ -0,0 +1,632 @@ +/* $OpenBSD: proc.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */ + +/* + * Copyright (c) 2010 - 2014 Reyk Floeter <reyk@openbsd.org> + * Copyright (c) 2008 Pierre-Yves Ritschard <pyr@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/types.h> +#include <sys/queue.h> +#include <sys/socket.h> +#include <sys/wait.h> + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <signal.h> +#include <pwd.h> +#include <event.h> +#include <imsg.h> + +#include "proc.h" + +void proc_open(struct privsep *, struct privsep_proc *, + struct privsep_proc *, size_t); +void proc_close(struct privsep *); +int proc_ispeer(struct privsep_proc *, unsigned int, enum privsep_procid); +void proc_shutdown(struct privsep_proc *); +void proc_sig_handler(int, short, void *); +void proc_range(struct privsep *, enum privsep_procid, int *, int *); +int proc_dispatch_null(int, struct privsep_proc *, struct imsg *); + +int +proc_ispeer(struct privsep_proc *procs, unsigned int nproc, + enum privsep_procid type) +{ + unsigned int i; + + for (i = 0; i < nproc; i++) + if (procs[i].p_id == type) + return (1); + return (0); +} + +void +proc_init(struct privsep *ps, struct privsep_proc *procs, unsigned int nproc) +{ + unsigned int i, j, src, dst; + struct privsep_pipes *pp; + + /* + * Allocate pipes for all process instances (incl. 
parent) + * + * - ps->ps_pipes: N:M mapping + * N source processes connected to M destination processes: + * [src][instances][dst][instances], for example + * [PROC_RELAY][3][PROC_CA][3] + * + * - ps->ps_pp: per-process 1:M part of ps->ps_pipes + * Each process instance has a destination array of socketpair fds: + * [dst][instances], for example + * [PROC_PARENT][0] + */ + for (src = 0; src < PROC_MAX; src++) { + /* Allocate destination array for each process */ + if ((ps->ps_pipes[src] = calloc(ps->ps_ninstances, + sizeof(struct privsep_pipes))) == NULL) + fatal("proc_init: calloc"); + + for (i = 0; i < ps->ps_ninstances; i++) { + pp = &ps->ps_pipes[src][i]; + + for (dst = 0; dst < PROC_MAX; dst++) { + /* Allocate maximum fd integers */ + if ((pp->pp_pipes[dst] = + calloc(ps->ps_ninstances, + sizeof(int))) == NULL) + fatal("proc_init: calloc"); + + /* Mark fd as unused */ + for (j = 0; j < ps->ps_ninstances; j++) + pp->pp_pipes[dst][j] = -1; + } + } + } + + /* + * Setup and run the parent and its children + */ + privsep_process = PROC_PARENT; + ps->ps_instances[PROC_PARENT] = 1; + ps->ps_title[PROC_PARENT] = "parent"; + ps->ps_pid[PROC_PARENT] = getpid(); + ps->ps_pp = &ps->ps_pipes[privsep_process][0]; + + for (i = 0; i < nproc; i++) { + /* Default to 1 process instance */ + if (ps->ps_instances[procs[i].p_id] < 1) + ps->ps_instances[procs[i].p_id] = 1; + ps->ps_title[procs[i].p_id] = procs[i].p_title; + } + + proc_open(ps, NULL, procs, nproc); + + /* Engage! 
*/ + for (i = 0; i < nproc; i++) + ps->ps_pid[procs[i].p_id] = (*procs[i].p_init)(ps, &procs[i]); +} + +void +proc_kill(struct privsep *ps) +{ + pid_t pid; + unsigned int i; + + if (privsep_process != PROC_PARENT) + return; + + for (i = 0; i < PROC_MAX; i++) { + if (ps->ps_pid[i] == 0) + continue; + killpg(ps->ps_pid[i], SIGTERM); + } + + do { + pid = waitpid(WAIT_ANY, NULL, 0); + } while (pid != -1 || (pid == -1 && errno == EINTR)); + + proc_close(ps); +} + +void +proc_open(struct privsep *ps, struct privsep_proc *p, + struct privsep_proc *procs, size_t nproc) +{ + struct privsep_pipes *pa, *pb; + int fds[2]; + unsigned int i, j, src, proc; + + if (p == NULL) + src = privsep_process; /* parent */ + else + src = p->p_id; + + /* + * Open socket pairs for our peers + */ + for (proc = 0; proc < nproc; proc++) { + procs[proc].p_ps = ps; + procs[proc].p_env = ps->ps_env; + if (procs[proc].p_cb == NULL) + procs[proc].p_cb = proc_dispatch_null; + + for (i = 0; i < ps->ps_instances[src]; i++) { + for (j = 0; j < ps->ps_instances[procs[proc].p_id]; + j++) { + pa = &ps->ps_pipes[src][i]; + pb = &ps->ps_pipes[procs[proc].p_id][j]; + + /* Check if fds are already set by peer */ + if (pa->pp_pipes[procs[proc].p_id][j] != -1) + continue; + + if (socketpair(AF_UNIX, + SOCK_STREAM | SOCK_NONBLOCK, + PF_UNSPEC, fds) == -1) + fatal("socketpair"); + + pa->pp_pipes[procs[proc].p_id][j] = fds[0]; + pb->pp_pipes[src][i] = fds[1]; + } + } + } +} + +void +proc_listen(struct privsep *ps, struct privsep_proc *procs, size_t nproc) +{ + unsigned int i, dst, src, n, m; + struct privsep_pipes *pp; + + /* + * Close unused pipes + */ + for (src = 0; src < PROC_MAX; src++) { + for (n = 0; n < ps->ps_instances[src]; n++) { + /* Ingore current process */ + if (src == (unsigned int)privsep_process && + n == ps->ps_instance) + continue; + + pp = &ps->ps_pipes[src][n]; + + for (dst = 0; dst < PROC_MAX; dst++) { + if (src == dst) + continue; + for (m = 0; m < ps->ps_instances[dst]; m++) { + if 
(pp->pp_pipes[dst][m] == -1) + continue; + + /* Close and invalidate fd */ + close(pp->pp_pipes[dst][m]); + pp->pp_pipes[dst][m] = -1; + } + } + } + } + + src = privsep_process; + ps->ps_pp = pp = &ps->ps_pipes[src][ps->ps_instance]; + + /* + * Listen on appropriate pipes + */ + for (i = 0; i < nproc; i++) { + dst = procs[i].p_id; + + if (src == dst) + fatal("proc_listen: cannot peer with oneself"); + + if ((ps->ps_ievs[dst] = calloc(ps->ps_instances[dst], + sizeof(struct imsgev))) == NULL) + fatal("proc_open"); + + for (n = 0; n < ps->ps_instances[dst]; n++) { + if (pp->pp_pipes[dst][n] == -1) + continue; + + imsg_init(&(ps->ps_ievs[dst][n].ibuf), + pp->pp_pipes[dst][n]); + ps->ps_ievs[dst][n].handler = proc_dispatch; + ps->ps_ievs[dst][n].events = EV_READ; + ps->ps_ievs[dst][n].proc = &procs[i]; + ps->ps_ievs[dst][n].data = &ps->ps_ievs[dst][n]; + procs[i].p_instance = n; + + event_set(&(ps->ps_ievs[dst][n].ev), + ps->ps_ievs[dst][n].ibuf.fd, + ps->ps_ievs[dst][n].events, + ps->ps_ievs[dst][n].handler, + ps->ps_ievs[dst][n].data); + event_add(&(ps->ps_ievs[dst][n].ev), NULL); + } + } +} + +void +proc_close(struct privsep *ps) +{ + unsigned int dst, n; + struct privsep_pipes *pp; + + if (ps == NULL) + return; + + pp = ps->ps_pp; + + for (dst = 0; dst < PROC_MAX; dst++) { + if (ps->ps_ievs[dst] == NULL) + continue; + + for (n = 0; n < ps->ps_instances[dst]; n++) { + if (pp->pp_pipes[dst][n] == -1) + continue; + + /* Cancel the fd, close and invalidate the fd */ + event_del(&(ps->ps_ievs[dst][n].ev)); + imsg_clear(&(ps->ps_ievs[dst][n].ibuf)); + close(pp->pp_pipes[dst][n]); + pp->pp_pipes[dst][n] = -1; + } + free(ps->ps_ievs[dst]); + } +} + +void +proc_shutdown(struct privsep_proc *p) +{ + struct privsep *ps = p->p_ps; + + if (p->p_id == PROC_CONTROL && ps) + control_cleanup(&ps->ps_csock); + + if (p->p_shutdown != NULL) + (*p->p_shutdown)(); + + proc_close(ps); + + log_info("%s exiting, pid %d", p->p_title, getpid()); + + _exit(0); +} + +void +proc_sig_handler(int 
sig, short event, void *arg) +{ + struct privsep_proc *p = arg; + + switch (sig) { + case SIGINT: + case SIGTERM: + proc_shutdown(p); + break; + case SIGCHLD: + case SIGHUP: + case SIGPIPE: + case SIGUSR1: + /* ignore */ + break; + default: + fatalx("proc_sig_handler: unexpected signal"); + /* NOTREACHED */ + } +} + +pid_t +proc_run(struct privsep *ps, struct privsep_proc *p, + struct privsep_proc *procs, unsigned int nproc, + void (*run)(struct privsep *, struct privsep_proc *, void *), void *arg) +{ + pid_t pid; + struct passwd *pw; + const char *root; + struct control_sock *rcs; + unsigned int n; + + if (ps->ps_noaction) + return (0); + + proc_open(ps, p, procs, nproc); + + /* Fork child handlers */ + switch (pid = fork()) { + case -1: + fatal("proc_run: cannot fork"); + case 0: + log_procinit(p->p_title); + + /* Set the process group of the current process */ + setpgid(0, 0); + break; + default: + return (pid); + } + + pw = ps->ps_pw; + + if (p->p_id == PROC_CONTROL && ps->ps_instance == 0) { + if (control_init(ps, &ps->ps_csock) == -1) + fatalx(__func__); + TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry) + if (control_init(ps, rcs) == -1) + fatalx(__func__); + } + + /* Change root directory */ + if (p->p_chroot != NULL) + root = p->p_chroot; + else + root = pw->pw_dir; + + if (chroot(root) == -1) + fatal("proc_run: chroot"); + if (chdir("/") == -1) + fatal("proc_run: chdir(\"/\")"); + + privsep_process = p->p_id; + + setproctitle("%s", p->p_title); + + if (setgroups(1, &pw->pw_gid) || + setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) || + setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid)) + fatal("proc_run: cannot drop privileges"); + + /* Fork child handlers */ + for (n = 1; n < ps->ps_instances[p->p_id]; n++) { + if (fork() == 0) { + ps->ps_instance = p->p_instance = n; + break; + } + } + +#ifdef DEBUG + log_debug("%s: %s %d/%d, pid %d", __func__, p->p_title, + ps->ps_instance + 1, ps->ps_instances[p->p_id], getpid()); +#endif + + event_init(); + + 
signal_set(&ps->ps_evsigint, SIGINT, proc_sig_handler, p); + signal_set(&ps->ps_evsigterm, SIGTERM, proc_sig_handler, p); + signal_set(&ps->ps_evsigchld, SIGCHLD, proc_sig_handler, p); + signal_set(&ps->ps_evsighup, SIGHUP, proc_sig_handler, p); + signal_set(&ps->ps_evsigpipe, SIGPIPE, proc_sig_handler, p); + signal_set(&ps->ps_evsigusr1, SIGUSR1, proc_sig_handler, p); + + signal_add(&ps->ps_evsigint, NULL); + signal_add(&ps->ps_evsigterm, NULL); + signal_add(&ps->ps_evsigchld, NULL); + signal_add(&ps->ps_evsighup, NULL); + signal_add(&ps->ps_evsigpipe, NULL); + signal_add(&ps->ps_evsigusr1, NULL); + + proc_listen(ps, procs, nproc); + + if (p->p_id == PROC_CONTROL && ps->ps_instance == 0) { + TAILQ_INIT(&ctl_conns); + if (control_listen(&ps->ps_csock) == -1) + fatalx(__func__); + TAILQ_FOREACH(rcs, &ps->ps_rcsocks, cs_entry) + if (control_listen(rcs) == -1) + fatalx(__func__); + } + + if (run != NULL) + run(ps, p, arg); + + event_dispatch(); + + proc_shutdown(p); + + return (0); +} + +void +proc_dispatch(int fd, short event, void *arg) +{ + struct imsgev *iev = arg; + struct privsep_proc *p = iev->proc; + struct privsep *ps = p->p_ps; + struct imsgbuf *ibuf; + struct imsg imsg; + ssize_t n; + int verbose; + const char *title; + + title = ps->ps_title[privsep_process]; + ibuf = &iev->ibuf; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1) + fatal(__func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + if (event & EV_WRITE) { + if (msgbuf_write(&ibuf->w) <= 0 && errno != EAGAIN) + fatal(__func__); + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatal(__func__); + if (n == 0) + break; + +#if DEBUG > 1 + log_debug("%s: %s %d got imsg %d from %s %d", + __func__, title, ps->ps_instance + 1, + imsg.hdr.type, p->p_title, p->p_instance); +#endif + + /* + * Check the message with the program callback + */ + if ((p->p_cb)(fd, p, &imsg) == 0) { + /* Message 
was handled by the callback, continue */ + imsg_free(&imsg); + continue; + } + + /* + * Generic message handling + */ + switch (imsg.hdr.type) { + case IMSG_CTL_VERBOSE: + IMSG_SIZE_CHECK(&imsg, &verbose); + memcpy(&verbose, imsg.data, sizeof(verbose)); + log_verbose(verbose); + break; + default: + log_warnx("%s: %s %d got invalid imsg %d from %s %d", + __func__, title, ps->ps_instance + 1, + imsg.hdr.type, p->p_title, p->p_instance); + fatalx(__func__); + } + imsg_free(&imsg); + } + imsg_event_add(iev); +} + +int +proc_dispatch_null(int fd, struct privsep_proc *p, struct imsg *imsg) +{ + return (-1); +} + +/* + * imsg helper functions + */ + +void +imsg_event_add(struct imsgev *iev) +{ + if (iev->handler == NULL) { + imsg_flush(&iev->ibuf); + return; + } + + iev->events = EV_READ; + if (iev->ibuf.w.queued) + iev->events |= EV_WRITE; + + event_del(&iev->ev); + event_set(&iev->ev, iev->ibuf.fd, iev->events, iev->handler, iev->data); + event_add(&iev->ev, NULL); +} + +int +imsg_compose_event(struct imsgev *iev, uint16_t type, uint32_t peerid, + pid_t pid, int fd, void *data, uint16_t datalen) +{ + int ret; + + if ((ret = imsg_compose(&iev->ibuf, type, peerid, + pid, fd, data, datalen)) == -1) + return (ret); + imsg_event_add(iev); + return (ret); +} + +int +imsg_composev_event(struct imsgev *iev, uint16_t type, uint32_t peerid, + pid_t pid, int fd, const struct iovec *iov, int iovcnt) +{ + int ret; + + if ((ret = imsg_composev(&iev->ibuf, type, peerid, + pid, fd, iov, iovcnt)) == -1) + return (ret); + imsg_event_add(iev); + return (ret); +} + +void +proc_range(struct privsep *ps, enum privsep_procid id, int *n, int *m) +{ + if (*n == -1) { + /* Use a range of all target instances */ + *n = 0; + *m = ps->ps_instances[id]; + } else { + /* Use only a single slot of the specified peer process */ + *m = *n + 1; + } +} + +int +proc_compose_imsg(struct privsep *ps, enum privsep_procid id, int n, + uint16_t type, int fd, void *data, uint16_t datalen) +{ + int m; + + 
proc_range(ps, id, &n, &m); + for (; n < m; n++) { + if (imsg_compose_event(&ps->ps_ievs[id][n], + type, -1, 0, fd, data, datalen) == -1) + return (-1); + } + + return (0); +} + +int +proc_composev_imsg(struct privsep *ps, enum privsep_procid id, int n, + uint16_t type, int fd, const struct iovec *iov, int iovcnt) +{ + int m; + + proc_range(ps, id, &n, &m); + for (; n < m; n++) + if (imsg_composev_event(&ps->ps_ievs[id][n], + type, -1, 0, fd, iov, iovcnt) == -1) + return (-1); + + return (0); +} + +int +proc_forward_imsg(struct privsep *ps, struct imsg *imsg, + enum privsep_procid id, int n) +{ + return (proc_compose_imsg(ps, id, n, imsg->hdr.type, + imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg))); +} + +struct imsgbuf * +proc_ibuf(struct privsep *ps, enum privsep_procid id, int n) +{ + int m; + + proc_range(ps, id, &n, &m); + return (&ps->ps_ievs[id][n].ibuf); +} + +struct imsgev * +proc_iev(struct privsep *ps, enum privsep_procid id, int n) +{ + int m; + + proc_range(ps, id, &n, &m); + return (&ps->ps_ievs[id][n]); +} diff --git a/usr.sbin/vmd/proc.h b/usr.sbin/vmd/proc.h new file mode 100644 index 00000000000..2c192551ef4 --- /dev/null +++ b/usr.sbin/vmd/proc.h @@ -0,0 +1,187 @@ +/* $OpenBSD: proc.h,v 1.1 2015/12/02 09:14:25 reyk Exp $ */ + +/* + * Copyright (c) 2010-2015 Reyk Floeter <reyk@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/queue.h> +#include <sys/uio.h> + +#include <imsg.h> +#include <event.h> + +#ifndef _PROC_H +#define _PROC_H + +enum { + IMSG_NONE, + IMSG_CTL_OK, + IMSG_CTL_FAIL, + IMSG_CTL_VERBOSE, + IMSG_CTL_END, + IMSG_CTL_NOTIFY, + IMSG_PROC_MAX +}; + +/* imsg */ +struct imsgev { + struct imsgbuf ibuf; + void (*handler)(int, short, void *); + struct event ev; + struct privsep_proc *proc; + void *data; + short events; +}; + +#define IMSG_SIZE_CHECK(imsg, p) do { \ + if (IMSG_DATA_SIZE(imsg) < sizeof(*p)) \ + fatalx("bad length imsg received"); \ +} while (0) +#define IMSG_DATA_SIZE(imsg) ((imsg)->hdr.len - IMSG_HEADER_SIZE) + +/* control socket */ +struct control_sock { + const char *cs_name; + struct event cs_ev; + struct event cs_evt; + int cs_fd; + int cs_restricted; + void *cs_env; + + TAILQ_ENTRY(control_sock) cs_entry; +}; +TAILQ_HEAD(control_socks, control_sock); + +struct { + struct event ev; + int fd; +} control_state; + +struct ctl_conn { + TAILQ_ENTRY(ctl_conn) entry; + u_int8_t flags; + u_int waiting; +#define CTL_CONN_NOTIFY 0x01 + struct imsgev iev; + +}; +TAILQ_HEAD(ctl_connlist, ctl_conn); +extern struct ctl_connlist ctl_conns; + +/* privsep */ +enum privsep_procid { + PROC_PARENT = 0, + PROC_CONTROL, + PROC_MAX, +} privsep_process; + +struct privsep_pipes { + int *pp_pipes[PROC_MAX]; +}; + +struct privsep { + struct privsep_pipes *ps_pipes[PROC_MAX]; + struct privsep_pipes *ps_pp; + + struct imsgev *ps_ievs[PROC_MAX]; + const char *ps_title[PROC_MAX]; + pid_t ps_pid[PROC_MAX]; + struct passwd *ps_pw; + int ps_noaction; + + struct control_sock ps_csock; + struct control_socks ps_rcsocks; + + u_int 
ps_instances[PROC_MAX]; + u_int ps_ninstances; + u_int ps_instance; + + /* Event and signal handlers */ + struct event ps_evsigint; + struct event ps_evsigterm; + struct event ps_evsigchld; + struct event ps_evsighup; + struct event ps_evsigpipe; + struct event ps_evsigusr1; + + void *ps_env; +}; + +struct privsep_proc { + const char *p_title; + enum privsep_procid p_id; + int (*p_cb)(int, struct privsep_proc *, + struct imsg *); + pid_t (*p_init)(struct privsep *, + struct privsep_proc *); + const char *p_chroot; + struct privsep *p_ps; + void *p_env; + void (*p_shutdown)(void); + u_int p_instance; +}; + +/* proc.c */ +void proc_init(struct privsep *, struct privsep_proc *, unsigned int); +void proc_kill(struct privsep *); +void proc_listen(struct privsep *, struct privsep_proc *, size_t); +void proc_dispatch(int, short event, void *); +pid_t proc_run(struct privsep *, struct privsep_proc *, + struct privsep_proc *, unsigned int, + void (*)(struct privsep *, struct privsep_proc *, void *), void *); +void imsg_event_add(struct imsgev *); +int imsg_compose_event(struct imsgev *, uint16_t, uint32_t, + pid_t, int, void *, uint16_t); +int imsg_composev_event(struct imsgev *, uint16_t, uint32_t, + pid_t, int, const struct iovec *, int); +int proc_compose_imsg(struct privsep *, enum privsep_procid, int, + uint16_t, int, void *, uint16_t); +int proc_composev_imsg(struct privsep *, enum privsep_procid, int, + uint16_t, int, const struct iovec *, int); +int proc_forward_imsg(struct privsep *, struct imsg *, + enum privsep_procid, int); +struct imsgbuf * + proc_ibuf(struct privsep *, enum privsep_procid, int); +struct imsgev * + proc_iev(struct privsep *, enum privsep_procid, int); + +/* control.c */ +pid_t control(struct privsep *, struct privsep_proc *); +int control_init(struct privsep *, struct control_sock *); +int control_listen(struct control_sock *); +void control_cleanup(struct control_sock *); + +/* log.c */ +void log_init(int, int); +void log_procinit(const char 
*); +void log_verbose(int); +void log_warn(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); +void log_warnx(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); +void log_info(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); +void log_debug(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); +void logit(int, const char *, ...) + __attribute__((__format__ (printf, 2, 3))); +void vlog(int, const char *, va_list) + __attribute__((__format__ (printf, 2, 0))); +__dead void fatal(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); +__dead void fatalx(const char *, ...) + __attribute__((__format__ (printf, 1, 2))); + +#endif /* _PROC_H */ diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c index 35ad6d749ff..b72874397b7 100644 --- a/usr.sbin/vmd/vmd.c +++ b/usr.sbin/vmd/vmd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.c,v 1.8 2015/11/26 08:26:48 reyk Exp $ */ +/* $OpenBSD: vmd.c,v 1.9 2015/12/02 09:14:25 reyk Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -16,163 +16,105 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -/* - * vmd(8) - virtual machine daemon - */ - -#include <sys/types.h> -#include <sys/ioctl.h> +#include <sys/param.h> #include <sys/queue.h> -#include <sys/uio.h> -#include <sys/socket.h> -#include <sys/stat.h> -#include <sys/un.h> #include <sys/wait.h> -#include <sys/mman.h> -#include <sys/time.h> - -#include <dev/ic/comreg.h> -#include <dev/ic/i8253reg.h> -#include <dev/isa/isareg.h> -#include <dev/pci/pcireg.h> - -#include <machine/param.h> -#include <machine/vmmvar.h> +#include <sys/cdefs.h> +#include <stdio.h> +#include <stdlib.h> #include <errno.h> +#include <event.h> #include <fcntl.h> -#include <imsg.h> -#include <limits.h> -#include <pthread.h> #include <pwd.h> #include <signal.h> -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> #include <syslog.h> -#include <termios.h> #include <unistd.h> -#include <poll.h> -#include <util.h> +#include "proc.h" #include "vmd.h" -#include "loadfile.h" -#include "pci.h" -#include "virtio.h" - -#define NR_BACKLOG 5 - -#define MAX_TAP 256 - -/* - * Emulated 8250 UART - * - */ -#define COM1_DATA 0x3f8 -#define COM1_IER 0x3f9 -#define COM1_IIR 0x3fa -#define COM1_LCR 0x3fb -#define COM1_MCR 0x3fc -#define COM1_LSR 0x3fd -#define COM1_MSR 0x3fe -#define COM1_SCR 0x3ff - -/* - * Emulated i8253 PIT (counter) - */ -#define TIMER_BASE 0x40 -#define TIMER_CTRL 0x43 /* 8253 Timer #1 */ -#define NS_PER_TICK (1000000000 / TIMER_FREQ) - -/* i8253 registers */ -struct i8253_counter { - struct timeval tv; /* timer start time */ - uint16_t start; /* starting value */ - uint16_t olatch; /* output latch */ - uint16_t ilatch; /* input latch */ - uint8_t last_r; /* last read byte (MSB/LSB) */ - uint8_t last_w; /* last written byte (MSB/LSB) */ -}; - -/* ns8250 UART registers */ -struct ns8250_regs { - uint8_t lcr; /* Line Control Register */ - uint8_t fcr; /* FIFO Control Register */ - uint8_t iir; /* Interrupt ID Register */ - uint8_t ier; /* Interrupt Enable Register */ - uint8_t divlo; /* Baud rate 
divisor low byte */ - uint8_t divhi; /* Baud rate divisor high byte */ - uint8_t msr; /* Modem Status Register */ - uint8_t lsr; /* Line Status Register */ - uint8_t mcr; /* Modem Control Register */ - uint8_t scr; /* Scratch Register */ - uint8_t data; /* Unread input data */ -}; - -struct i8253_counter i8253_counter[3]; -struct ns8250_regs com1_regs; __dead void usage(void); -void sighdlr(int); -int main(int, char **); -int control_run(void); -int start_vm(struct imsg *); -int terminate_vm(struct imsg *); -int get_info_vm(struct imsgbuf *); -int start_client_vmd(void); -int opentap(void); -int run_vm(int *, int *, struct vm_create_params *); -void *vcpu_run_loop(void *); -int vcpu_exit(struct vm_run_params *); -int vmm_create_vm(struct vm_create_params *); -void init_emulated_hw(struct vm_create_params *, int *, int *); -void vcpu_exit_inout(struct vm_run_params *); -uint8_t vcpu_exit_pci(struct vm_run_params *); -void vcpu_exit_i8253(union vm_exit *); -void vcpu_exit_com(struct vm_run_params *); -void vcpu_process_com_data(union vm_exit *); -void vcpu_process_com_lcr(union vm_exit *); -void vcpu_process_com_lsr(union vm_exit *); -void vcpu_process_com_ier(union vm_exit *); -void vcpu_process_com_mcr(union vm_exit *); -void vcpu_process_com_iir(union vm_exit *); -void vcpu_process_com_msr(union vm_exit *); -void vcpu_process_com_scr(union vm_exit *); +int main(int, char **); +int vmd_configure(void); +void vmd_sighdlr(int sig, short event, void *arg); +void vmd_shutdown(void); +int vmd_control_run(void); -int vmm_fd, con_fd, vm_id; -volatile sig_atomic_t quit; +struct vmd *env; -SLIST_HEAD(vmstate_head, vmstate); -struct vmstate_head vmstate; - -extern char *__progname; +static struct privsep_proc procs[] = { + { "control", PROC_CONTROL, vmm_dispatch_control, control }, +}; -/* - * sighdlr - * - * Signal handler for TERM/INT/CHLD signals used during daemon shutdown - * - * Parameters: - * sig: signal caught - */ void -sighdlr(int sig) +vmd_sighdlr(int sig, short 
event, void *arg) { - pid_t pid; + struct privsep *ps = arg; + int die = 0, status, fail, id; + pid_t pid; + char *cause; + const char *title = "vm"; switch (sig) { + case SIGHUP: + log_info("%s: ignoring SIGHUP", __func__); + break; + case SIGPIPE: + log_info("%s: ignoring SIGPIPE", __func__); + break; + case SIGUSR1: + log_info("%s: ignoring SIGUSR1", __func__); + break; case SIGTERM: case SIGINT: - /* Tell main imsg loop to exit */ - quit = 1; - break; + die = 1; + /* FALLTHROUGH */ case SIGCHLD: do { - pid = waitpid(WAIT_ANY, NULL, WNOHANG); - } while (pid != -1 || (pid == -1 && errno == EINTR)); + int len; + + pid = waitpid(-1, &status, WNOHANG); + if (pid <= 0) + continue; + + fail = 0; + if (WIFSIGNALED(status)) { + fail = 1; + len = asprintf(&cause, "terminated; signal %d", + WTERMSIG(status)); + } else if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + fail = 1; + len = asprintf(&cause, + "exited abnormally"); + } else + len = asprintf(&cause, "exited okay"); + } else + fatalx("unexpected cause of SIGCHLD"); + + if (len == -1) + fatal("asprintf"); + + for (id = 0; id < PROC_MAX; id++) { + if (pid == ps->ps_pid[id]) { + die = 1; + title = ps->ps_title[id]; + break; + } + } + if (fail) + log_warnx("lost child: %s %s", title, cause); + + free(cause); + } while (pid > 0 || (pid == -1 && errno == EINTR)); + + if (die) + vmd_shutdown(); break; + default: + fatalx("unexpected signal"); } } @@ -187,1453 +129,114 @@ usage(void) int main(int argc, char **argv) { - int debug = 0, verbose = 0, c, res; + struct privsep *ps; + int ch; + + if ((env = calloc(1, sizeof(*env))) == NULL) + fatal("calloc: env"); - while ((c = getopt(argc, argv, "dv")) != -1) { - switch (c) { + while ((ch = getopt(argc, argv, "dvn")) != -1) { + switch (ch) { case 'd': - debug = 2; + env->vmd_debug = 2; break; case 'v': - verbose++; + env->vmd_verbose++; + break; + case 'n': + env->vmd_noaction = 1; break; default: usage(); } } - /* log to stderr until daemonized */ - log_init(debug ? 
debug : 1, LOG_DAEMON); - - /* Open /dev/vmm */ - vmm_fd = open(VMM_NODE, O_RDONLY); - if (vmm_fd == -1) - fatal("can't open vmm device node %s", VMM_NODE); - - setproctitle("control"); - - SLIST_INIT(&vmstate); - - signal(SIGTERM, sighdlr); - signal(SIGINT, sighdlr); - signal(SIGCHLD, sighdlr); - - log_init(debug, LOG_DAEMON); - log_verbose(verbose); - log_procinit("control"); - - if (!debug && daemon(1, 0) == -1) - fatal("can't daemonize"); - - res = control_run(); - - if (res == -1) - fatalx("control socket error"); - - return (0); -} - -/* - * control_run - * - * Main control loop - establishes listening socket for incoming vmmctl(8) - * requests and dispatches appropriate calls to vmm(4). Replies to - * vmmctl(8) using imsg. - * - * Return values: - * 0: normal exit (signal to quit received) - * -1: abnormal exit (various causes) - */ -int -control_run(void) -{ - struct sockaddr_un sun, c_sun; - socklen_t len; - int fd, connfd, n, res, nfd; - mode_t mode, old_umask; - char *socketpath; - struct imsgbuf *ibuf; - struct imsg imsg; - struct pollfd pfd[1]; - - /* Establish and start listening on control socket */ - socketpath = SOCKET_NAME; - if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0)) == -1) { - log_warn("%s: socket error", __progname); - return (-1); - } - - bzero(&sun, sizeof(sun)); - sun.sun_family = AF_UNIX; - if (strlcpy(sun.sun_path, socketpath, sizeof(sun.sun_path)) >= - sizeof(sun.sun_path)) { - log_warnx("%s: socket name too long", __progname); - close(fd); - return (-1); - } - - if (unlink(socketpath) == -1) - if (errno != ENOENT) { - log_warn("%s: unlink of %s failed", - __progname, socketpath); - close(fd); - return (-1); - } - - old_umask = umask(S_IXUSR|S_IXGRP|S_IWOTH|S_IROTH|S_IXOTH); - mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP; - - if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == -1) { - log_warn("%s: control_init: bind of %s failed", - __progname, socketpath); - close(fd); - umask(old_umask); - return (-1); - } - - 
umask(old_umask); - - if (chmod(socketpath, mode) == -1) { - log_warn("%s: control_init: chmod of %s failed", - __progname, socketpath); - close(fd); - unlink(socketpath); - return (-1); - } + /* check for root privileges */ + if (geteuid()) + fatalx("need root privileges"); - if ((ibuf = malloc(sizeof(struct imsgbuf))) == NULL) { - log_warn("%s: out of memory", __progname); - close(fd); - unlink(socketpath); - return (-1); - } + SLIST_INIT(&env->vmd_vmstate); - if (listen(fd, NR_BACKLOG) == -1) { - log_warn("%s: listen failed", __progname); - close(fd); - unlink(socketpath); - return (-1); - } + ps = &env->vmd_ps; + ps->ps_env = env; + TAILQ_INIT(&ps->ps_rcsocks); - while (!quit) { - pfd[0].fd = fd; - pfd[0].events = POLLIN; + if ((ps->ps_pw = getpwnam(VMD_USER)) == NULL) + fatal("unknown user %s", VMD_USER); - nfd = poll(pfd, 1, INFTIM); - if (nfd == -1) { - if (errno == EINTR) - continue; - fatal("poll"); - } - if (nfd == 0) - continue; - if ((pfd[0].revents & (POLLERR|POLLNVAL))) - fatalx("bad fd %d", fd); - if ((pfd[0].revents & (POLLIN|POLLHUP)) == 0) - fatalx("bad fd %d events", fd); + /* Configure the control socket */ + ps->ps_csock.cs_name = SOCKET_NAME; - if ((connfd = accept4(fd, (struct sockaddr *)&c_sun, &len, - SOCK_CLOEXEC)) == -1) { - log_warn("%s: accept4 error", __progname); - close(fd); - unlink(socketpath); - return (-1); - } - - imsg_init(ibuf, connfd); - if ((n = imsg_read(ibuf)) == -1 || n == 0) { - log_warnx("%s: imsg_read error, n=%d", - __progname, n); - continue; - } - - for (;;) { - if ((n = imsg_get(ibuf, &imsg)) == -1) - return (-1); - - if (n == 0) - break; - - /* Process incoming message (from vmmctl(8)) */ - switch (imsg.hdr.type) { - case IMSG_VMDOP_START_VM_REQUEST: - res = start_vm(&imsg); - imsg_compose(ibuf, - IMSG_VMDOP_START_VM_RESPONSE, 0, 0, -1, - &res, sizeof(res)); - break; - case IMSG_VMDOP_TERMINATE_VM_REQUEST: - res = terminate_vm(&imsg); - imsg_compose(ibuf, - IMSG_VMDOP_TERMINATE_VM_RESPONSE, 0, 0, -1, - &res, 
sizeof(res)); - break; - case IMSG_VMDOP_GET_INFO_VM_REQUEST: - res = get_info_vm(ibuf); - imsg_compose(ibuf, - IMSG_VMDOP_GET_INFO_VM_END_DATA, 0, 0, -1, - &res, sizeof(res)); - break; - } - - while (ibuf->w.queued) - if (msgbuf_write(&ibuf->w) <= 0 && errno != - EAGAIN) { - log_warn("%s: msgbuf_write error", - __progname); - close(fd); - close(connfd); - unlink(socketpath); - return (-1); - } - imsg_free(&imsg); - } - close(connfd); - } - - signal(SIGCHLD, SIG_IGN); - - return (0); -} - -/* - * terminate_vm - * - * Requests vmm(4) to terminate the VM whose ID is provided in the - * supplied vm_terminate_params structure (vtp->vtp_vm_id) - * - * Parameters - * imsg: The incoming imsg body whose 'data' field contains the - * vm_terminate_params struct - * - * Return values: - * 0: success - * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not - * valid) - */ -int -terminate_vm(struct imsg *imsg) -{ - struct vm_terminate_params *vtp; - - vtp = (struct vm_terminate_params *)imsg->data; - - if (ioctl(vmm_fd, VMM_IOC_TERM, vtp) < 0) - return (errno); - - return (0); -} - -/* - * opentap - * - * Opens the next available tap device, up to MAX_TAP. - * - * Returns a file descriptor to the tap node opened, or -1 if no tap - * devices were available. - */ -int -opentap(void) -{ - int i, fd; - char path[PATH_MAX]; - - for (i = 0; i < MAX_TAP; i++) { - snprintf(path, PATH_MAX, "/dev/tap%d", i); - fd = open(path, O_RDWR | O_NONBLOCK); - if (fd != -1) - return (fd); - } - - return (-1); -} - -/* - * start_vm - * - * Starts a new VM with the creation parameters supplied (in the incoming - * imsg->data field). This function performs a basic sanity check on the - * incoming parameters and then performs the following steps to complete - * the creation of the VM: - * - * 1. opens the VM disk image files specified in the VM creation parameters - * 2. opens the specified VM kernel - * 3. creates a VM console tty pair using openpty - * 4. 
forks, passing the file descriptors opened in steps 1-3 to the child - * vmd responsible for dropping privilege and running the VM's VCPU - * loops. - * - * Parameters: - * imsg: The incoming imsg body whose 'data' field is a vm_create_params - * struct containing the VM creation parameters. - * - * Return values: - * 0: success - * !0 : failure - typically an errno indicating the source of the failure - */ -int -start_vm(struct imsg *imsg) -{ - struct vm_create_params *vcp; - size_t i; - off_t kernel_size; - struct stat sb; - int child_disks[VMM_MAX_DISKS_PER_VM], kernel_fd, ret, ttym_fd; - int child_taps[VMM_MAX_NICS_PER_VM]; - int ttys_fd; - char ptyn[32]; - - vcp = (struct vm_create_params *)imsg->data; - - for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++) - child_disks[i] = -1; - for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++) - child_taps[i] = -1; - - /* - * XXX kernel_fd can't be global (possible race if multiple VMs - * being created at the same time). Probably need to move this - * into the child before dropping privs, or just make it local - * to this function? 
- */ - kernel_fd = -1; - - ttym_fd = -1; - ttys_fd = -1; - - /* Open disk images for child */ - for (i = 0 ; i < vcp->vcp_ndisks; i++) { - child_disks[i] = open(vcp->vcp_disks[i], O_RDWR); - if (child_disks[i] == -1) { - ret = errno; - log_warn("%s: can't open %s", __progname, - vcp->vcp_disks[i]); - goto err; - } - } - - bzero(&sb, sizeof(sb)); - if (stat(vcp->vcp_kernel, &sb) == -1) { - ret = errno; - log_warn("%s: can't stat kernel image %s", - __progname, vcp->vcp_kernel); - goto err; - } - - kernel_size = sb.st_size; - - /* Open kernel image */ - kernel_fd = open(vcp->vcp_kernel, O_RDONLY); - if (kernel_fd == -1) { - ret = errno; - log_warn("%s: can't open kernel image %s", - __progname, vcp->vcp_kernel); - goto err; - } - - if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) { - ret = errno; - log_warn("%s: openpty failed", __progname); - goto err; - } - - if (close(ttys_fd)) { - ret = errno; - log_warn("%s: close tty failed", __progname); - goto err; - } - - /* Open tap devices for child */ - for (i = 0 ; i < vcp->vcp_nnics; i++) { - child_taps[i] = opentap(); - if (child_taps[i] == -1) { - ret = errno; - log_warn("%s: can't open tap for nic %zd", - __progname, i); - goto err; - } - } - - /* Start child vmd for this VM (fork, chroot, drop privs) */ - ret = start_client_vmd(); - - /* Start child failed? - cleanup and leave */ - if (ret == -1) { - ret = EIO; - goto err; - } - - if (ret > 0) { - /* Parent */ - for (i = 0 ; i < vcp->vcp_ndisks; i++) - close(child_disks[i]); - - for (i = 0 ; i < vcp->vcp_nnics; i++) - close(child_taps[i]); - - close(kernel_fd); - close(ttym_fd); + /* Open /dev/vmm */ + env->vmd_fd = open(VMM_NODE, O_RDWR); + if (env->vmd_fd == -1) + fatal("can't open vmm device node %s", VMM_NODE); - return (0); - } - else { - /* Child */ - setproctitle(vcp->vcp_name); - log_procinit(vcp->vcp_name); + /* log to stderr until daemonized */ + log_init(env->vmd_debug ? 
env->vmd_debug : 1, LOG_DAEMON); - log_info("%s: vm console: %s", __progname, ptyn); - ret = vmm_create_vm(vcp); - if (ret) { - errno = ret; - fatal("create vmm ioctl failed - exiting"); - } + if (!env->vmd_debug && daemon(0, 0) == -1) + fatal("can't daemonize"); - /* Load kernel image */ - ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size); - if (ret) { - errno = ret; - fatal("failed to load kernel - exiting"); - } + ps->ps_ninstances = 1; + proc_init(ps, procs, nitems(procs)); - close(kernel_fd); + setproctitle("parent"); + log_procinit("parent"); - con_fd = ttym_fd; - if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) - fatal("failed to set nonblocking mode on console"); + event_init(); - /* Execute the vcpu run loop(s) for this VM */ - ret = run_vm(child_disks, child_taps, vcp); - _exit(ret != 0); - } - - return (ret); + signal_set(&ps->ps_evsigint, SIGINT, vmd_sighdlr, ps); + signal_set(&ps->ps_evsigterm, SIGTERM, vmd_sighdlr, ps); + signal_set(&ps->ps_evsigchld, SIGCHLD, vmd_sighdlr, ps); + signal_set(&ps->ps_evsighup, SIGHUP, vmd_sighdlr, ps); + signal_set(&ps->ps_evsigpipe, SIGPIPE, vmd_sighdlr, ps); + signal_set(&ps->ps_evsigusr1, SIGUSR1, vmd_sighdlr, ps); -err: - for (i = 0 ; i < vcp->vcp_ndisks; i++) - if (child_disks[i] != -1) - close(child_disks[i]); + signal_add(&ps->ps_evsigint, NULL); + signal_add(&ps->ps_evsigterm, NULL); + signal_add(&ps->ps_evsigchld, NULL); + signal_add(&ps->ps_evsighup, NULL); + signal_add(&ps->ps_evsigpipe, NULL); + signal_add(&ps->ps_evsigusr1, NULL); - for (i = 0 ; i < vcp->vcp_nnics; i++) - if (child_taps[i] != -1) - close(child_taps[i]); + proc_listen(ps, procs, nitems(procs)); - if (kernel_fd != -1) - close(kernel_fd); + if (vmd_configure() == -1) + fatalx("configuration failed"); - if (ttym_fd != -1) - close(ttym_fd); + event_dispatch(); - return (ret); -} + log_debug("parent exiting"); -/* - * get_info_vm - * - * Returns a list of VMs known to vmm(4). 
- * - * Parameters: - * ibuf: the imsg ibuf in which to place the results. A new imsg will - * be created using this ibuf. - * - * Return values: - * 0: success - * !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl) - */ -int -get_info_vm(struct imsgbuf *ibuf) -{ - int ret; - size_t ct, i; - struct ibuf *obuf; - struct vm_info_params vip; - struct vm_info_result *info; - - /* - * We issue the VMM_IOC_INFO ioctl twice, once with an input - * buffer size of 0, which results in vmm(4) returning the - * number of bytes required back to us in vip.vip_size, - * and then we call it again after malloc'ing the required - * number of bytes. - * - * It is possible that we could fail a second time (eg, if - * another VM was created in the instant between the two - * ioctls, but in that case the caller can just try again - * as vmm(4) will return a zero-sized list in that case. - */ - vip.vip_size = 0; - info = NULL; - ret = 0; - - /* First ioctl to see how many bytes needed (vip.vip_size) */ - if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0) - return (errno); - - if (vip.vip_info_ct != 0) - return (EIO); - - info = malloc(vip.vip_size); - if (info == NULL) - return (ENOMEM); - - /* Second ioctl to get the actual list */ - vip.vip_info = info; - if (ioctl(vmm_fd, VMM_IOC_INFO, &vip) < 0) { - ret = errno; - free(info); - return (ret); - } - - /* Return info to vmmctl(4) */ - ct = vip.vip_size / sizeof(struct vm_info_result); - for (i = 0; i < ct; i++) { - obuf = imsg_create(ibuf, IMSG_VMDOP_GET_INFO_VM_DATA, 0, 0, - sizeof(struct vm_info_result)); - imsg_add(obuf, &info[i], sizeof(struct vm_info_result)); - imsg_close(ibuf, obuf); - } - free(info); return (0); } - -/* - * start_client_vmd - * - * forks a copy of the parent vmd, chroots to VMD_USER's home, drops - * privileges (changes to user VMD_USER), and returns. - * Should the fork operation succeed, but later chroot/privsep - * fail, the child exits. 
- * - * Return values (returns to both child and parent on success): - * -1 : failure - * 0: return to child vmd returns 0 - * !0 : return to parent vmd returns the child's pid - */ int -start_client_vmd(void) +vmd_configure(void) { - int child_pid; - struct passwd *pw; - - pw = getpwnam(VMD_USER); - if (pw == NULL) { - log_warnx("%s: no such user %s", __progname, VMD_USER); - return (-1); +#if 0 + if (parse_config(env->sc_conffile, env) == -1) { + proc_kill(&env->sc_ps); + exit(1); } +#endif - child_pid = fork(); - if (child_pid < 0) - return (-1); - - if (!child_pid) { - /* Child */ - if (chroot(pw->pw_dir) != 0) - fatal("unable to chroot"); - if (chdir("/") != 0) - fatal("unable to chdir"); - - if (setgroups(1, &pw->pw_gid) == -1) - fatal("setgroups() failed"); - if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1) - fatal("setresgid() failed"); - if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1) - fatal("setresuid() failed"); - - return (0); + if (env->vmd_noaction) { + fprintf(stderr, "configuration OK\n"); + proc_kill(&env->vmd_ps); + exit(0); } - /* Parent */ - return (child_pid); -} - -/* - * vmm_create_vm - * - * Requests vmm(4) to create a new VM using the supplied creation - * parameters. This operation results in the creation of the in-kernel - * structures for the VM, but does not start the VM's vcpu(s). 
- * - * Parameters: - * vcp: vm_create_params struct containing the VM's desired creation - * configuration - * - * Return values: - * 0: success - * !0 : ioctl to vmm(4) failed - */ -int -vmm_create_vm(struct vm_create_params *vcp) -{ - /* Sanity check arguments */ - if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) - return (EINVAL); - - if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE) - return (EINVAL); - - if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) - return (EINVAL); - - if (ioctl(vmm_fd, VMM_IOC_CREATE, vcp) < 0) - return (errno); - return (0); } -/* - * init_emulated_hw - * - * Initializes the userspace hardware emulation - */ -void -init_emulated_hw(struct vm_create_params *vcp, int *child_disks, - int *child_taps) -{ - /* Init the i8253 PIT's 3 counters */ - bzero(&i8253_counter, sizeof(struct i8253_counter) * 3); - gettimeofday(&i8253_counter[0].tv, NULL); - gettimeofday(&i8253_counter[1].tv, NULL); - gettimeofday(&i8253_counter[2].tv, NULL); - i8253_counter[0].start = TIMER_DIV(100); - i8253_counter[1].start = TIMER_DIV(100); - i8253_counter[2].start = TIMER_DIV(100); - - /* Init ns8250 UART */ - bzero(&com1_regs, sizeof(struct ns8250_regs)); - - /* Initialize PCI */ - pci_init(); - - /* Initialize virtio devices */ - virtio_init(vcp, child_disks, child_taps); -} - -/* - * run_vm - * - * Runs the VM whose creation parameters are specified in vcp - * - * Parameters: - * vcp: vm_create_params struct containing the VM's desired creation - * configuration - * child_disks: previously-opened child VM disk file file descriptors - * child_taps: previously-opened child tap file descriptors - * - * Return values: - * 0: the VM exited normally - * !0 : the VM exited abnormally or failed to start - */ -int -run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp) -{ - size_t i; - int ret; - pthread_t *tid; - void *exit_status; - struct vm_run_params **vrp; - - ret = 0; - - /* XXX cap vcp_ncpus to avoid overflow here */ - /* - * XXX ensure nvcpus in vcp is 
same as vm, or fix vmm to return einval - * on bad vcpu id - */ - tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus); - vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus); - if (tid == NULL || vrp == NULL) { - log_warn("%s: memory allocation error - exiting.", - __progname); - return (ENOMEM); - } - - init_emulated_hw(vcp, child_disks, child_taps); - - /* - * Create and launch one thread for each VCPU. These threads may - * migrate between PCPUs over time; the need to reload CPU state - * in such situations is detected and performed by vmm(4) in the - * kernel. - */ - for (i = 0 ; i < vcp->vcp_ncpus; i++) { - vrp[i] = malloc(sizeof(struct vm_run_params)); - if (vrp[i] == NULL) { - log_warn("%s: memory allocation error - " - "exiting.", __progname); - /* caller will exit, so skip free'ing */ - return (ENOMEM); - } - vrp[i]->vrp_exit = malloc(sizeof(union vm_exit)); - if (vrp[i]->vrp_exit == NULL) { - log_warn("%s: memory allocation error - " - "exiting.", __progname); - /* caller will exit, so skip free'ing */ - return (ENOMEM); - } - vrp[i]->vrp_vm_id = vcp->vcp_id; - vrp[i]->vrp_vcpu_id = i; - - /* Start each VCPU run thread at vcpu_run_loop */ - ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); - if (ret) { - /* caller will _exit after this return */ - return (ret); - } - } - - /* Wait for all the threads to exit */ - for (i = 0; i < vcp->vcp_ncpus; i++) { - if (pthread_join(tid[i], &exit_status)) { - log_warn("%s: failed to join thread %zd - " - "exiting", __progname, i); - return (EIO); - } - - if (exit_status != NULL) { - log_warnx("%s: vm %d vcpu run thread %zd exited " - "abnormally", __progname, vcp->vcp_id, i); - ret = EIO; - } - } - - return (ret); -} - -/* - * vcpu_run_loop - * - * Runs a single VCPU until vmm(4) requires help handling an exit, - * or the VM terminates. 
- * - * Parameters: - * arg: vcpu_run_params for the VCPU being run by this thread - * - * Return values: - * NULL: the VCPU shutdown properly - * !NULL: error processing VCPU run, or the VCPU shutdown abnormally - */ -void * -vcpu_run_loop(void *arg) -{ - struct vm_run_params *vrp = (struct vm_run_params *)arg; - intptr_t ret; - - vrp->vrp_continue = 0; - vrp->vrp_injint = -1; - - for (;;) { - if (ioctl(vmm_fd, VMM_IOC_RUN, vrp) < 0) { - /* If run ioctl failed, exit */ - ret = errno; - return ((void *)ret); - } - - /* If the VM is terminating, exit normally */ - if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) - return (NULL); - - if (vrp->vrp_exit_reason != VM_EXIT_NONE) { - /* - * vmm(4) needs help handling an exit, handle in - * vcpu_exit. - */ - if (vcpu_exit(vrp)) - return ((void *)EIO); - } - } - - return (NULL); -} - -/* - * vcpu_exit_i8253 - * - * Handles emulated i8253 PIT access (in/out instruction to PIT ports). - * We don't emulate all the modes of the i8253, just the basic squarewave - * clock. 
- * - * Parameters: - * vei: VM exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_exit_i8253(union vm_exit *vei) -{ - uint32_t out_data; - uint8_t sel, rw, data; - uint64_t ns, ticks; - struct timeval now, delta; - - if (vei->vei.vei_port == TIMER_CTRL) { - if (vei->vei.vei_dir == 0) { /* OUT instruction */ - out_data = vei->vei.vei_data; - sel = out_data & - (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2); - sel = sel >> 6; - if (sel > 2) { - log_warnx("%s: i8253 PIT: invalid " - "timer selected (%d)", - __progname, sel); - return; - } - - rw = vei->vei.vei_data & - (TIMER_LATCH | TIMER_LSB | - TIMER_MSB | TIMER_16BIT); - - if (rw == TIMER_16BIT) { - /* - * XXX this seems to be used on occasion, needs - * to be implemented - */ - log_warnx("%s: i8253 PIT: 16 bit " - "counter I/O not supported", - __progname); - return; - } - - /* - * Since we don't truly emulate each tick of the PIT - * clock, when the guest asks for the timer to be - * latched, simulate what the counter would have been - * had we performed full emulation. We do this by - * calculating when the counter was reset vs how much - * time has elapsed, then bias by the counter tick - * rate. - */ - if (rw == TIMER_LATCH) { - gettimeofday(&now, NULL); - delta.tv_sec = now.tv_sec - - i8253_counter[sel].tv.tv_sec; - delta.tv_usec = now.tv_usec - - i8253_counter[sel].tv.tv_usec; - if (delta.tv_usec < 0) { - delta.tv_sec--; - delta.tv_usec += 1000000; - } - if (delta.tv_usec > 1000000) { - delta.tv_sec++; - delta.tv_usec -= 1000000; - } - ns = delta.tv_usec * 1000 + - delta.tv_sec * 1000000000; - ticks = ns / NS_PER_TICK; - i8253_counter[sel].olatch = - i8253_counter[sel].start - - ticks % i8253_counter[sel].start; - return; - } - - log_warnx("%s: i8253 PIT: unsupported rw mode " - "%d", __progname, rw); - return; - } else { - /* XXX should this return 0xff? 
*/ - log_warnx("%s: i8253 PIT: read from control " - "port unsupported", __progname); - } - } else { - sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE); - if (vei->vei.vei_dir == 0) { /* OUT instruction */ - if (i8253_counter[sel].last_w == 0) { - out_data = vei->vei.vei_data; - i8253_counter[sel].ilatch |= (out_data << 8); - i8253_counter[sel].last_w = 1; - } else { - out_data = vei->vei.vei_data; - i8253_counter[sel].ilatch |= out_data; - i8253_counter[sel].start = - i8253_counter[sel].ilatch; - i8253_counter[sel].last_w = 0; - } - } else { - if (i8253_counter[sel].last_r == 0) { - data = i8253_counter[sel].olatch >> 8; - vei->vei.vei_data = data; - i8253_counter[sel].last_w = 1; - } else { - data = i8253_counter[sel].olatch & 0xFF; - vei->vei.vei_data = data; - i8253_counter[sel].last_w = 0; - } - } - } -} - -/* - * vcpu_process_com_data - * - * Emulate in/out instructions to the com1 (ns8250) UART data register - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_data(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * The guest wrote to the data register. Since we are emulating a - * no-fifo chip, write the character immediately to the pty and - * assert TXRDY in IIR (if the guest has requested TXRDY interrupt - * reporting) - */ - if (vei->vei.vei_dir == 0) { - write(con_fd, &vei->vei.vei_data, 1); - if (com1_regs.ier & 0x2) { - /* Set TXRDY */ - com1_regs.iir |= IIR_TXRDY; - /* Set "interrupt pending" (IIR low bit cleared) */ - com1_regs.iir &= ~0x1; - } - } else { - /* - * vei_dir == 1 : in instruction - * - * The guest read from the data register. Check to see if - * there is data available (RXRDY) and if so, consume the - * input data and return to the guest. Also clear the - * interrupt info register regardless. 
- */ - if (com1_regs.lsr & LSR_RXRDY) { - vei->vei.vei_data = com1_regs.data; - com1_regs.data = 0x0; - com1_regs.lsr &= ~LSR_RXRDY; - } else { - /* XXX should this be com1_regs.data or 0xff? */ - vei->vei.vei_data = com1_regs.data; - log_warnx("guest reading com1 when not ready"); - } - - /* Reading the data register always clears RXRDY from IIR */ - com1_regs.iir &= ~IIR_RXRDY; - - /* - * Clear "interrupt pending" by setting IIR low bit to 1 - * if no interrupt are pending - */ - if (com1_regs.iir == 0x0) - com1_regs.iir = 0x1; - } -} - -/* - * vcpu_process_com_lcr - * - * Emulate in/out instructions to the com1 (ns8250) UART line control register - * - * Paramters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_lcr(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write content to line control register - */ - if (vei->vei.vei_dir == 0) { - com1_regs.lcr = (uint8_t)vei->vei.vei_data; - } else { - /* - * vei_dir == 1 : in instruction - * - * Read line control register - */ - vei->vei.vei_data = com1_regs.lcr; - } -} - -/* - * vcpu_process_com_iir - * - * Emulate in/out instructions to the com1 (ns8250) UART interrupt information - * register. Note that writes to this register actually are to a different - * register, the FCR (FIFO control register) that we don't emulate but still - * consume the data provided. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ void -vcpu_process_com_iir(union vm_exit *vei) +vmd_shutdown(void) { - /* - * vei_dir == 0 : out instruction - * - * Write to FCR - */ - if (vei->vei.vei_dir == 0) { - com1_regs.fcr = vei->vei.vei_data; - } else { - /* - * vei_dir == 1 : in instruction - * - * Read IIR. Reading the IIR resets the TXRDY bit in the IIR - * after the data is read. 
- */ - vei->vei.vei_data = com1_regs.iir; - com1_regs.iir &= ~IIR_TXRDY; + proc_kill(&env->vmd_ps); + free(env); - /* - * Clear "interrupt pending" by setting IIR low bit to 1 - * if no interrupts are pending - */ - if (com1_regs.iir == 0x0) - com1_regs.iir = 0x1; - } -} - -/* - * vcpu_process_com_mcr - * - * Emulate in/out instructions to the com1 (ns8250) UART modem control - * register. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_mcr(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write to MCR - */ - if (vei->vei.vei_dir == 0) { - com1_regs.mcr = vei->vei.vei_data; - } else { - /* - * vei_dir == 1 : in instruction - * - * Read from MCR - */ - vei->vei.vei_data = com1_regs.mcr; - } -} - -/* - * vcpu_process_com_lsr - * - * Emulate in/out instructions to the com1 (ns8250) UART line status register. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_lsr(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write to LSR. This is an illegal operation, so we just log it and - * continue. - */ - if (vei->vei.vei_dir == 0) { - log_warnx("%s: LSR UART write 0x%x unsupported", - __progname, vei->vei.vei_data); - } else { - /* - * vei_dir == 1 : in instruction - * - * Read from LSR. We always report TXRDY and TSRE since we - * can process output characters immediately (at any time). - */ - vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY; - } -} - -/* - * vcpu_process_com_msr - * - * Emulate in/out instructions to the com1 (ns8250) UART modem status register. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_msr(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write to MSR. 
This is an illegal operation, so we just log it and - * continue. - */ - if (vei->vei.vei_dir == 0) { - log_warnx("%s: MSR UART write 0x%x unsupported", - __progname, vei->vei.vei_data); - } else { - /* - * vei_dir == 1 : in instruction - * - * Read from MSR. We always report DCD, DSR, and CTS. - */ - vei->vei.vei_data = com1_regs.lsr | MSR_DCD | MSR_DSR | MSR_CTS; - } -} - -/* - * vcpu_process_com_scr - * - * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The - * scratch register is sometimes used to distinguish an 8250 from a 16450, - * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We - * simulate an "original" 8250 by forcing the scratch register to return data - * on read that is different from what was written. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_scr(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write to SCR - */ - if (vei->vei.vei_dir == 0) { - com1_regs.scr = vei->vei.vei_data; - } else { - /* - * vei_dir == 1 : in instruction - * - * Read from SCR. To make sure we don't accidentally simulate - * a real scratch register, we negate what was written on - * subsequent readback. - */ - vei->vei.vei_data = ~com1_regs.scr; - } -} - -/* - * vcpu_process_com_ier - * - * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable - * register. - * - * Parameters: - * vei: vm exit information from vmm(4) containing information on the in/out - * instruction being performed - */ -void -vcpu_process_com_ier(union vm_exit *vei) -{ - /* - * vei_dir == 0 : out instruction - * - * Write to IER - */ - if (vei->vei.vei_dir == 0) { - com1_regs.ier = vei->vei.vei_data; - } else { - /* - * vei_dir == 1 : in instruction - * - * Read from IER - */ - vei->vei.vei_data = com1_regs.ier; - } -} - -/* - * vcpu_exit_com - * - * Process com1 (ns8250) UART exits. 
vmd handles most basic 8250 - * features with the exception of the divisor latch (eg, no baud - * rate support) - * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - */ -void -vcpu_exit_com(struct vm_run_params *vrp) -{ - union vm_exit *vei = vrp->vrp_exit; - - switch(vei->vei.vei_port) { - case COM1_LCR: - vcpu_process_com_lcr(vei); - break; - case COM1_IER: - vcpu_process_com_ier(vei); - break; - case COM1_IIR: - vcpu_process_com_iir(vei); - break; - case COM1_MCR: - vcpu_process_com_mcr(vei); - break; - case COM1_LSR: - vcpu_process_com_lsr(vei); - break; - case COM1_MSR: - vcpu_process_com_msr(vei); - break; - case COM1_SCR: - vcpu_process_com_scr(vei); - break; - case COM1_DATA: - vcpu_process_com_data(vei); - break; - } -} - -/* - * vcpu_exit_pci - * - * Handle all I/O to the emulated PCI subsystem. - * - * Parameters: - * vrp: vcpu run paramters containing guest state for this exit - * - * Return values: - * 0xff if no interrupt is required after this pci exit, - * or an interrupt vector otherwise - */ -uint8_t -vcpu_exit_pci(struct vm_run_params *vrp) -{ - union vm_exit *vei = vrp->vrp_exit; - uint8_t intr; - - intr = 0xFF; - - switch(vei->vei.vei_port) { - case PCI_MODE1_ADDRESS_REG: - pci_handle_address_reg(vrp); - break; - case PCI_MODE1_DATA_REG: - pci_handle_data_reg(vrp); - break; - case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: - intr = pci_handle_io(vrp); - break; - default: - log_warnx("%s: unknown PCI register 0x%llx", - __progname, (uint64_t)vei->vei.vei_port); - break; - } - - return (intr); -} - -/* - * vcpu_exit_inout - * - * Handle all I/O exits that need to be emulated in vmd. This includes the - * i8253 PIT and the com1 ns8250 UART. 
- * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - */ -void -vcpu_exit_inout(struct vm_run_params *vrp) -{ - union vm_exit *vei = vrp->vrp_exit; - uint8_t intr; - - switch(vei->vei.vei_port) { - case TIMER_CTRL: - case (TIMER_CNTR0 + TIMER_BASE): - case (TIMER_CNTR1 + TIMER_BASE): - case (TIMER_CNTR2 + TIMER_BASE): - vcpu_exit_i8253(vei); - break; - case COM1_DATA ... COM1_SCR: - vcpu_exit_com(vrp); - break; - case PCI_MODE1_ADDRESS_REG: - case PCI_MODE1_DATA_REG: - case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: - intr = vcpu_exit_pci(vrp); - if (intr != 0xFF) - vrp->vrp_injint = intr; - else - vrp->vrp_injint = -1; - break; - default: - /* IN from unsupported port gives FFs */ - if (vei->vei.vei_dir == 1) - vei->vei.vei_data = 0xFFFFFFFF; - break; - } -} - -/* - * vcpu_exit - * - * Handle a vcpu exit. This function is called when it is determined that - * vmm(4) requires the assistance of vmd to support a particular guest - * exit type (eg, accessing an I/O port or device). Guest state is contained - * in 'vrp', and will be resent to vmm(4) on exit completion. - * - * Upon conclusion of handling the exit, the function determines if any - * interrupts should be injected into the guest, and sets vrp->vrp_injint - * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt - * is to be injected). 
- * - * Parameters: - * vrp: vcpu run parameters containing guest state for this exit - * - * Return values: - * 0: the exit was handled successfully - * 1: an error occurred (exit not handled) - */ -int -vcpu_exit(struct vm_run_params *vrp) -{ - ssize_t sz; - char ch; - - switch (vrp->vrp_exit_reason) { - case VMX_EXIT_IO: - vcpu_exit_inout(vrp); - break; - case VMX_EXIT_HLT: - /* - * XXX handle halted state, no reason to run this vcpu again - * until a vm interrupt is to be injected - */ - break; - default: - log_warnx("%s: unknown exit reason %d", - __progname, vrp->vrp_exit_reason); - return (1); - } - - /* XXX interrupt priority */ - if (vionet_process_rx()) - vrp->vrp_injint = 9; - - /* - * Is there a new character available on com1? - * If so, consume the character, buffer it into the com1 data register - * assert IRQ4, and set the line status register RXRDY bit. - * - * XXX - move all this com intr checking to another function - */ - sz = read(con_fd, &ch, sizeof(char)); - if (sz == 1) { - com1_regs.lsr |= LSR_RXRDY; - com1_regs.data = ch; - /* XXX these ier and iir bits should be IER_x and IIR_x */ - if (com1_regs.ier & 0x1) { - com1_regs.iir |= (2 << 1); - com1_regs.iir &= ~0x1; - } - } - - /* - * Clear "interrupt pending" by setting IIR low bit to 1 if no - * interrupts are pending - */ - /* XXX these iir magic numbers should be IIR_x */ - if ((com1_regs.iir & ~0x1) == 0x0) - com1_regs.iir = 0x1; - - /* If pending interrupt and nothing waiting to be injected, inject */ - if ((com1_regs.iir & 0x1) == 0) - if (vrp->vrp_injint == -1) - vrp->vrp_injint = 0x4; - vrp->vrp_continue = 1; - - return (0); -} - -/* - * write_page - * - * Pushes a page of data from 'buf' into the guest VM's memory - * at paddr 'dst'. - * - * Parameters: - * dst: the destination paddr_t in the guest VM to push into. 
- * If there is no guest paddr mapping at 'dst', a new page will be - * faulted in by the VMM (provided 'dst' represents a valid paddr - * in the guest's address space) - * buf: page of data to push - * len: size of 'buf' - * do_mask: 1 to mask the destination address (for kernel load), 0 to - * leave 'dst' unmasked - * - * Return values: - * various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error - * occurred. - * - * Note - this function only handles GPAs < 4GB. - */ -int -write_page(uint32_t dst, void *buf, uint32_t len, int do_mask) -{ - struct vm_writepage_params vwp; - - /* - * Mask kernel load addresses to avoid uint32_t -> uint64_t cast - * errors - */ - if (do_mask) - dst &= 0xFFFFFFF; - - vwp.vwp_paddr = (paddr_t)dst; - vwp.vwp_data = buf; - vwp.vwp_vm_id = vm_id; - vwp.vwp_len = len; - if (ioctl(vmm_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) { - log_warn("writepage ioctl failed"); - return (errno); - } - return (0); -} - -/* - * read_page - * - * Reads a page of memory at guest paddr 'src' into 'buf'. - * - * Parameters: - * src: the source paddr_t in the guest VM to read from. - * buf: destination (local) buffer - * len: size of 'buf' - * do_mask: 1 to mask the source address (for kernel load), 0 to - * leave 'src' unmasked - * - * Return values: - * various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error - * occurred. - * - * Note - this function only handles GPAs < 4GB. 
- */ -int -read_page(uint32_t src, void *buf, uint32_t len, int do_mask) -{ - struct vm_readpage_params vrp; - - /* - * Mask kernel load addresses to avoid uint32_t -> uint64_t cast - * errors - */ - if (do_mask) - src &= 0xFFFFFFF; - - vrp.vrp_paddr = (paddr_t)src; - vrp.vrp_data = buf; - vrp.vrp_vm_id = vm_id; - vrp.vrp_len = len; - if (ioctl(vmm_fd, VMM_IOC_READPAGE, &vrp) < 0) { - log_warn("readpage ioctl failed"); - return (errno); - } - return (0); + log_warnx("parent terminating"); + exit(0); } diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h index c960dcfa735..55e0a221255 100644 --- a/usr.sbin/vmd/vmd.h +++ b/usr.sbin/vmd/vmd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmd.h,v 1.3 2015/11/23 13:04:49 reyk Exp $ */ +/* $OpenBSD: vmd.h,v 1.4 2015/12/02 09:14:25 reyk Exp $ */ /* * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> @@ -16,7 +16,10 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include <stdarg.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include "proc.h" #ifndef __VMD_H__ #define __VMD_H__ @@ -25,6 +28,9 @@ #define SOCKET_NAME "/var/run/vmd.sock" #define VMM_NODE "/dev/vmm" #define VM_NAME_MAX 256 +#define MAX_TAP 256 +#define NR_BACKLOG 5 + /* #define VMD_DEBUG */ @@ -34,10 +40,8 @@ #define dprintf(x...) 
#endif /* VMM_DEBUG */ - enum imsg_type { - IMSG_NONE, - IMSG_VMDOP_DISABLE_VMM_REQUEST, + IMSG_VMDOP_DISABLE_VMM_REQUEST = IMSG_PROC_MAX, IMSG_VMDOP_DISABLE_VMM_RESPONSE, IMSG_VMDOP_ENABLE_VMM_REQUEST, IMSG_VMDOP_ENABLE_VMM_RESPONSE, @@ -50,28 +54,22 @@ enum imsg_type { IMSG_VMDOP_GET_INFO_VM_END_DATA }; +SLIST_HEAD(vmstate_head, vmstate); + +struct vmd { + struct privsep vmd_ps; + int vmd_fd; + + int vmd_debug; + int vmd_verbose; + int vmd_noaction; + + struct vmstate_head vmd_vmstate; +}; + +/* vmm.c */ +int vmm_dispatch_control(int, struct privsep_proc *, struct imsg *); int write_page(uint32_t dst, void *buf, uint32_t, int); int read_page(uint32_t dst, void *buf, uint32_t, int); -/* log.c */ -void log_init(int, int); -void log_procinit(const char *); -void log_verbose(int); -void log_warn(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); -void log_warnx(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); -void log_info(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); -void log_debug(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); -void logit(int, const char *, ...) - __attribute__((__format__ (printf, 2, 3))); -void vlog(int, const char *, va_list) - __attribute__((__format__ (printf, 2, 0))); -__dead void fatal(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); -__dead void fatalx(const char *, ...) - __attribute__((__format__ (printf, 1, 2))); - #endif /* __VMD_H__ */ diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c new file mode 100644 index 00000000000..32c10397e86 --- /dev/null +++ b/usr.sbin/vmd/vmm.c @@ -0,0 +1,1408 @@ +/* $OpenBSD: vmm.c,v 1.1 2015/12/02 09:14:25 reyk Exp $ */ + +/* + * Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <sys/ioctl.h> +#include <sys/queue.h> +#include <sys/uio.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/un.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <sys/time.h> + +#include <dev/ic/comreg.h> +#include <dev/ic/i8253reg.h> +#include <dev/isa/isareg.h> +#include <dev/pci/pcireg.h> + +#include <machine/param.h> +#include <machine/vmmvar.h> + +#include <errno.h> +#include <fcntl.h> +#include <imsg.h> +#include <limits.h> +#include <pthread.h> +#include <pwd.h> +#include <signal.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <termios.h> +#include <unistd.h> +#include <poll.h> +#include <util.h> + +#include "vmd.h" +#include "loadfile.h" +#include "pci.h" +#include "virtio.h" +#include "proc.h" + +/* + * Emulated 8250 UART + * + */ +#define COM1_DATA 0x3f8 +#define COM1_IER 0x3f9 +#define COM1_IIR 0x3fa +#define COM1_LCR 0x3fb +#define COM1_MCR 0x3fc +#define COM1_LSR 0x3fd +#define COM1_MSR 0x3fe +#define COM1_SCR 0x3ff + +/* + * Emulated i8253 PIT (counter) + */ +#define TIMER_BASE 0x40 +#define TIMER_CTRL 0x43 /* 8253 Timer #1 */ +#define NS_PER_TICK (1000000000 / TIMER_FREQ) + +/* i8253 registers */ +struct i8253_counter { + struct timeval tv; /* timer start time */ + uint16_t start; /* starting value */ + uint16_t olatch; /* output latch */ + uint16_t ilatch; /* input latch */ + uint8_t last_r; /* last read byte 
(MSB/LSB) */ + uint8_t last_w; /* last written byte (MSB/LSB) */ +}; + +/* ns8250 UART registers */ +struct ns8250_regs { + uint8_t lcr; /* Line Control Register */ + uint8_t fcr; /* FIFO Control Register */ + uint8_t iir; /* Interrupt ID Register */ + uint8_t ier; /* Interrupt Enable Register */ + uint8_t divlo; /* Baud rate divisor low byte */ + uint8_t divhi; /* Baud rate divisor high byte */ + uint8_t msr; /* Modem Status Register */ + uint8_t lsr; /* Line Status Register */ + uint8_t mcr; /* Modem Control Register */ + uint8_t scr; /* Scratch Register */ + uint8_t data; /* Unread input data */ +}; + +struct i8253_counter i8253_counter[3]; +struct ns8250_regs com1_regs; + +int start_client_vmd(void); +int opentap(void); +int start_vm(struct imsg *); +int terminate_vm(struct imsg *); +int get_info_vm(struct privsep *, struct imsg *); +int run_vm(int *, int *, struct vm_create_params *); +void *vcpu_run_loop(void *); +int vcpu_exit(struct vm_run_params *); +int vmm_create_vm(struct vm_create_params *); +void init_emulated_hw(struct vm_create_params *, int *, int *); +void vcpu_exit_inout(struct vm_run_params *); +uint8_t vcpu_exit_pci(struct vm_run_params *); +void vcpu_exit_i8253(union vm_exit *); +void vcpu_exit_com(struct vm_run_params *); +void vcpu_process_com_data(union vm_exit *); +void vcpu_process_com_lcr(union vm_exit *); +void vcpu_process_com_lsr(union vm_exit *); +void vcpu_process_com_ier(union vm_exit *); +void vcpu_process_com_mcr(union vm_exit *); +void vcpu_process_com_iir(union vm_exit *); +void vcpu_process_com_msr(union vm_exit *); +void vcpu_process_com_scr(union vm_exit *); + +int con_fd, vm_id; + +extern struct vmd *env; + +extern char *__progname; + +int +vmm_dispatch_control(int fd, struct privsep_proc *p, struct imsg *imsg) +{ + struct privsep *ps = p->p_ps; + int res = 0, cmd = 0; + + switch (imsg->hdr.type) { + case IMSG_VMDOP_START_VM_REQUEST: + res = start_vm(imsg); + cmd = IMSG_VMDOP_START_VM_RESPONSE; + break; + case 
IMSG_VMDOP_TERMINATE_VM_REQUEST: + res = terminate_vm(imsg); + cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; + break; + case IMSG_VMDOP_GET_INFO_VM_REQUEST: + res = get_info_vm(ps, imsg); + cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA; + break; + default: + return (-1); + } + + if (cmd && + imsg_compose_event(&ps->ps_ievs[PROC_CONTROL][0], + cmd, imsg->hdr.peerid, 0, -1, &res, sizeof(res)) == -1) + return (-1); + + return (0); +} + +/* + * terminate_vm + * + * Requests vmm(4) to terminate the VM whose ID is provided in the + * supplied vm_terminate_params structure (vtp->vtp_vm_id) + * + * Parameters + * imsg: The incoming imsg body whose 'data' field contains the + * vm_terminate_params struct + * + * Return values: + * 0: success + * !0 : ioctl to vmm(4) failed (eg, ENOENT if the supplied VM is not + * valid) + */ +int +terminate_vm(struct imsg *imsg) +{ + struct vm_terminate_params *vtp; + + vtp = (struct vm_terminate_params *)imsg->data; + + if (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) < 0) + return (errno); + + return (0); +} + +/* + * opentap + * + * Opens the next available tap device, up to MAX_TAP. + * + * Returns a file descriptor to the tap node opened, or -1 if no tap + * devices were available. + */ +int +opentap(void) +{ + int i, fd; + char path[PATH_MAX]; + + for (i = 0; i < MAX_TAP; i++) { + snprintf(path, PATH_MAX, "/dev/tap%d", i); + fd = open(path, O_RDWR | O_NONBLOCK); + if (fd != -1) + return (fd); + } + + return (-1); +} + +/* + * start_vm + * + * Starts a new VM with the creation parameters supplied (in the incoming + * imsg->data field). This function performs a basic sanity check on the + * incoming parameters and then performs the following steps to complete + * the creation of the VM: + * + * 1. opens the VM disk image files specified in the VM creation parameters + * 2. opens the specified VM kernel + * 3. creates a VM console tty pair using openpty + * 4. 
forks, passing the file descriptors opened in steps 1-3 to the child + * vmd responsible for dropping privilege and running the VM's VCPU + * loops. + * + * Parameters: + * imsg: The incoming imsg body whose 'data' field is a vm_create_params + * struct containing the VM creation parameters. + * + * Return values: + * 0: success + * !0 : failure - typically an errno indicating the source of the failure + */ +int +start_vm(struct imsg *imsg) +{ + struct vm_create_params *vcp; + size_t i; + off_t kernel_size; + struct stat sb; + int child_disks[VMM_MAX_DISKS_PER_VM], kernel_fd, ret, ttym_fd; + int child_taps[VMM_MAX_NICS_PER_VM]; + int ttys_fd; + char ptyn[32]; + + vcp = (struct vm_create_params *)imsg->data; + + for (i = 0 ; i < VMM_MAX_DISKS_PER_VM; i++) + child_disks[i] = -1; + for (i = 0 ; i < VMM_MAX_NICS_PER_VM; i++) + child_taps[i] = -1; + + /* + * XXX kernel_fd can't be global (possible race if multiple VMs + * being created at the same time). Probably need to move this + * into the child before dropping privs, or just make it local + * to this function? 
+ */ + kernel_fd = -1; + + ttym_fd = -1; + ttys_fd = -1; + + /* Open disk images for child */ + for (i = 0 ; i < vcp->vcp_ndisks; i++) { + child_disks[i] = open(vcp->vcp_disks[i], O_RDWR); + if (child_disks[i] == -1) { + ret = errno; + log_warn("%s: can't open %s", __progname, + vcp->vcp_disks[i]); + goto err; + } + } + + bzero(&sb, sizeof(sb)); + if (stat(vcp->vcp_kernel, &sb) == -1) { + ret = errno; + log_warn("%s: can't stat kernel image %s", + __progname, vcp->vcp_kernel); + goto err; + } + + kernel_size = sb.st_size; + + /* Open kernel image */ + kernel_fd = open(vcp->vcp_kernel, O_RDONLY); + if (kernel_fd == -1) { + ret = errno; + log_warn("%s: can't open kernel image %s", + __progname, vcp->vcp_kernel); + goto err; + } + + if (openpty(&ttym_fd, &ttys_fd, ptyn, NULL, NULL) == -1) { + ret = errno; + log_warn("%s: openpty failed", __progname); + goto err; + } + + if (close(ttys_fd)) { + ret = errno; + log_warn("%s: close tty failed", __progname); + goto err; + } + + /* Open tap devices for child */ + for (i = 0 ; i < vcp->vcp_nnics; i++) { + child_taps[i] = opentap(); + if (child_taps[i] == -1) { + ret = errno; + log_warn("%s: can't open tap for nic %zd", + __progname, i); + goto err; + } + } + + /* Start child vmd for this VM (fork, chroot, drop privs) */ + ret = start_client_vmd(); + + /* Start child failed? 
- cleanup and leave */ + if (ret == -1) { + ret = EIO; + goto err; + } + + if (ret > 0) { + /* Parent */ + for (i = 0 ; i < vcp->vcp_ndisks; i++) + close(child_disks[i]); + + for (i = 0 ; i < vcp->vcp_nnics; i++) + close(child_taps[i]); + + close(kernel_fd); + close(ttym_fd); + + return (0); + } + else { + /* Child */ + setproctitle(vcp->vcp_name); + log_procinit(vcp->vcp_name); + + log_info("%s: vm console: %s", __progname, ptyn); + ret = vmm_create_vm(vcp); + if (ret) { + errno = ret; + fatal("create vmm ioctl failed - exiting"); + } + + /* Load kernel image */ + ret = loadelf_main(kernel_fd, vcp->vcp_id, vcp->vcp_memory_size); + if (ret) { + errno = ret; + fatal("failed to load kernel - exiting"); + } + + close(kernel_fd); + + con_fd = ttym_fd; + if (fcntl(con_fd, F_SETFL, O_NONBLOCK) == -1) + fatal("failed to set nonblocking mode on console"); + + /* Execute the vcpu run loop(s) for this VM */ + ret = run_vm(child_disks, child_taps, vcp); + _exit(ret != 0); + } + + return (ret); + +err: + for (i = 0 ; i < vcp->vcp_ndisks; i++) + if (child_disks[i] != -1) + close(child_disks[i]); + + for (i = 0 ; i < vcp->vcp_nnics; i++) + if (child_taps[i] != -1) + close(child_taps[i]); + + if (kernel_fd != -1) + close(kernel_fd); + + if (ttym_fd != -1) + close(ttym_fd); + + return (ret); +} + +/* + * get_info_vm + * + * Returns a list of VMs known to vmm(4). + * + * Parameters: + * ps: the privsep context; each result is passed to imsg_compose_event + * on ps->ps_ievs[PROC_CONTROL][0] + * imsg: the incoming request imsg; its hdr.peerid is used as the peerid + * of each result message 
+ * + * Return values: + * 0: success + * !0 : failure (eg, ENOMEM, EIO or another error code from vmm(4) ioctl) + */ +int +get_info_vm(struct privsep *ps, struct imsg *imsg) +{ + int ret; + size_t ct, i; + struct vm_info_params vip; + struct vm_info_result *info; + + /* + * We issue the VMM_IOC_INFO ioctl twice, once with an input + * buffer size of 0, which results in vmm(4) returning the + * number of bytes required back to us in vip.vip_size, + * and then we call it again after malloc'ing the required + * number of bytes. + * + * It is possible that we could fail a second time (eg, if + * another VM was created in the instant between the two + * ioctls), but then the caller can just try again, + * as vmm(4) will return a zero-sized list in that case. + */ + vip.vip_size = 0; + info = NULL; + ret = 0; + + /* First ioctl to see how many bytes needed (vip.vip_size) */ + if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) + return (errno); + + if (vip.vip_info_ct != 0) + return (EIO); + + info = malloc(vip.vip_size); + if (info == NULL) + return (ENOMEM); + + /* Second ioctl to get the actual list */ + vip.vip_info = info; + if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) < 0) { + ret = errno; + free(info); + return (ret); + } + + /* Return info to vmmctl(4) */ + ct = vip.vip_size / sizeof(struct vm_info_result); + for (i = 0; i < ct; i++) { + if (imsg_compose_event(&ps->ps_ievs[PROC_CONTROL][0], + IMSG_VMDOP_GET_INFO_VM_DATA, imsg->hdr.peerid, 0, + -1, &info[i], sizeof(struct vm_info_result)) == -1) + return (EIO); + } + free(info); + return (0); +} + + +/* + * start_client_vmd + * + * forks a copy of the parent vmd, chroots to VMD_USER's home, drops + * privileges (changes to user VMD_USER), and returns. + * Should the fork operation succeed, but later chroot/privsep + * fail, the child exits. 
+ * + * Return values (returns to both child and parent on success): + * -1 : failure + * 0: return to child vmd returns 0 + * !0 : return to parent vmd returns the child's pid + */ +int +start_client_vmd(void) +{ + int child_pid; + struct passwd *pw; + + pw = getpwnam(VMD_USER); + if (pw == NULL) { + log_warnx("%s: no such user %s", __progname, VMD_USER); + return (-1); + } + + child_pid = fork(); + if (child_pid < 0) + return (-1); + + if (!child_pid) { + /* Child */ + if (chroot(pw->pw_dir) != 0) + fatal("unable to chroot"); + if (chdir("/") != 0) + fatal("unable to chdir"); + + if (setgroups(1, &pw->pw_gid) == -1) + fatal("setgroups() failed"); + if (setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) == -1) + fatal("setresgid() failed"); + if (setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid) == -1) + fatal("setresuid() failed"); + + return (0); + } + + /* Parent */ + return (child_pid); +} + +/* + * vmm_create_vm + * + * Requests vmm(4) to create a new VM using the supplied creation + * parameters. This operation results in the creation of the in-kernel + * structures for the VM, but does not start the VM's vcpu(s). 
+ * + * Parameters: + * vcp: vm_create_params struct containing the VM's desired creation + * configuration + * + * Return values: + * 0: success + * !0 : ioctl to vmm(4) failed + */ +int +vmm_create_vm(struct vm_create_params *vcp) +{ + /* Sanity check arguments */ + if (vcp->vcp_ncpus > VMM_MAX_VCPUS_PER_VM) + return (EINVAL); + + if (vcp->vcp_memory_size > VMM_MAX_VM_MEM_SIZE) + return (EINVAL); + + if (vcp->vcp_ndisks > VMM_MAX_DISKS_PER_VM) + return (EINVAL); + + if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) < 0) + return (errno); + + return (0); +} + +/* + * init_emulated_hw + * + * Initializes the userspace hardware emulation + */ +void +init_emulated_hw(struct vm_create_params *vcp, int *child_disks, + int *child_taps) +{ + /* Init the i8253 PIT's 3 counters */ + bzero(&i8253_counter, sizeof(struct i8253_counter) * 3); + gettimeofday(&i8253_counter[0].tv, NULL); + gettimeofday(&i8253_counter[1].tv, NULL); + gettimeofday(&i8253_counter[2].tv, NULL); + i8253_counter[0].start = TIMER_DIV(100); + i8253_counter[1].start = TIMER_DIV(100); + i8253_counter[2].start = TIMER_DIV(100); + + /* Init ns8250 UART */ + bzero(&com1_regs, sizeof(struct ns8250_regs)); + + /* Initialize PCI */ + pci_init(); + + /* Initialize virtio devices */ + virtio_init(vcp, child_disks, child_taps); +} + +/* + * run_vm + * + * Runs the VM whose creation parameters are specified in vcp + * + * Parameters: + * vcp: vm_create_params struct containing the VM's desired creation + * configuration + * child_disks: previously-opened child VM disk file file descriptors + * child_taps: previously-opened child tap file descriptors + * + * Return values: + * 0: the VM exited normally + * !0 : the VM exited abnormally or failed to start + */ +int +run_vm(int *child_disks, int *child_taps, struct vm_create_params *vcp) +{ + size_t i; + int ret; + pthread_t *tid; + void *exit_status; + struct vm_run_params **vrp; + + ret = 0; + + /* XXX cap vcp_ncpus to avoid overflow here */ + /* + * XXX ensure nvcpus in 
vcp is same as vm, or fix vmm to return einval + * on bad vcpu id + */ + tid = malloc(sizeof(pthread_t) * vcp->vcp_ncpus); + vrp = malloc(sizeof(struct vm_run_params *) * vcp->vcp_ncpus); + if (tid == NULL || vrp == NULL) { + log_warn("%s: memory allocation error - exiting.", + __progname); + return (ENOMEM); + } + + init_emulated_hw(vcp, child_disks, child_taps); + + /* + * Create and launch one thread for each VCPU. These threads may + * migrate between PCPUs over time; the need to reload CPU state + * in such situations is detected and performed by vmm(4) in the + * kernel. + */ + for (i = 0 ; i < vcp->vcp_ncpus; i++) { + vrp[i] = malloc(sizeof(struct vm_run_params)); + if (vrp[i] == NULL) { + log_warn("%s: memory allocation error - " + "exiting.", __progname); + /* caller will exit, so skip free'ing */ + return (ENOMEM); + } + vrp[i]->vrp_exit = malloc(sizeof(union vm_exit)); + if (vrp[i]->vrp_exit == NULL) { + log_warn("%s: memory allocation error - " + "exiting.", __progname); + /* caller will exit, so skip free'ing */ + return (ENOMEM); + } + vrp[i]->vrp_vm_id = vcp->vcp_id; + vrp[i]->vrp_vcpu_id = i; + + /* Start each VCPU run thread at vcpu_run_loop */ + ret = pthread_create(&tid[i], NULL, vcpu_run_loop, vrp[i]); + if (ret) { + /* caller will _exit after this return */ + return (ret); + } + } + + /* Wait for all the threads to exit */ + for (i = 0; i < vcp->vcp_ncpus; i++) { + if (pthread_join(tid[i], &exit_status)) { + log_warn("%s: failed to join thread %zd - " + "exiting", __progname, i); + return (EIO); + } + + if (exit_status != NULL) { + log_warnx("%s: vm %d vcpu run thread %zd exited " + "abnormally", __progname, vcp->vcp_id, i); + ret = EIO; + } + } + + return (ret); +} + +/* + * vcpu_run_loop + * + * Runs a single VCPU until vmm(4) requires help handling an exit, + * or the VM terminates. 
+ * + * Parameters: + * arg: vcpu_run_params for the VCPU being run by this thread + * + * Return values: + * NULL: the VCPU shutdown properly + * !NULL: error processing VCPU run, or the VCPU shutdown abnormally + */ +void * +vcpu_run_loop(void *arg) +{ + struct vm_run_params *vrp = (struct vm_run_params *)arg; + intptr_t ret; + + vrp->vrp_continue = 0; + vrp->vrp_injint = -1; + + for (;;) { + if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) < 0) { + /* If run ioctl failed, exit */ + ret = errno; + return ((void *)ret); + } + + /* If the VM is terminating, exit normally */ + if (vrp->vrp_exit_reason == VM_EXIT_TERMINATED) + return (NULL); + + if (vrp->vrp_exit_reason != VM_EXIT_NONE) { + /* + * vmm(4) needs help handling an exit, handle in + * vcpu_exit. + */ + if (vcpu_exit(vrp)) + return ((void *)EIO); + } + } + + return (NULL); +} + +/* + * vcpu_exit_i8253 + * + * Handles emulated i8253 PIT access (in/out instruction to PIT ports). + * We don't emulate all the modes of the i8253, just the basic squarewave + * clock. 
+ * + * Parameters: + * vei: VM exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_exit_i8253(union vm_exit *vei) +{ + uint32_t out_data; + uint8_t sel, rw, data; + uint64_t ns, ticks; + struct timeval now, delta; + + if (vei->vei.vei_port == TIMER_CTRL) { + if (vei->vei.vei_dir == 0) { /* OUT instruction */ + out_data = vei->vei.vei_data; + sel = out_data & + (TIMER_SEL0 | TIMER_SEL1 | TIMER_SEL2); + sel = sel >> 6; + if (sel > 2) { + log_warnx("%s: i8253 PIT: invalid " + "timer selected (%d)", + __progname, sel); + return; + } + + rw = vei->vei.vei_data & + (TIMER_LATCH | TIMER_LSB | + TIMER_MSB | TIMER_16BIT); + + if (rw == TIMER_16BIT) { + /* + * XXX this seems to be used on occasion, needs + * to be implemented + */ + log_warnx("%s: i8253 PIT: 16 bit " + "counter I/O not supported", + __progname); + return; + } + + /* + * Since we don't truly emulate each tick of the PIT + * clock, when the guest asks for the timer to be + * latched, simulate what the counter would have been + * had we performed full emulation. We do this by + * calculating when the counter was reset vs how much + * time has elapsed, then bias by the counter tick + * rate. + */ + if (rw == TIMER_LATCH) { + gettimeofday(&now, NULL); + delta.tv_sec = now.tv_sec - + i8253_counter[sel].tv.tv_sec; + delta.tv_usec = now.tv_usec - + i8253_counter[sel].tv.tv_usec; + if (delta.tv_usec < 0) { + delta.tv_sec--; + delta.tv_usec += 1000000; + } + if (delta.tv_usec > 1000000) { + delta.tv_sec++; + delta.tv_usec -= 1000000; + } + ns = delta.tv_usec * 1000 + + delta.tv_sec * 1000000000; + ticks = ns / NS_PER_TICK; + i8253_counter[sel].olatch = + i8253_counter[sel].start - + ticks % i8253_counter[sel].start; + return; + } + + log_warnx("%s: i8253 PIT: unsupported rw mode " + "%d", __progname, rw); + return; + } else { + /* XXX should this return 0xff? 
*/ + log_warnx("%s: i8253 PIT: read from control " + "port unsupported", __progname); + } + } else { + sel = vei->vei.vei_port - (TIMER_CNTR0 + TIMER_BASE); + if (vei->vei.vei_dir == 0) { /* OUT instruction */ + if (i8253_counter[sel].last_w == 0) { + out_data = vei->vei.vei_data; + i8253_counter[sel].ilatch |= (out_data << 8); + i8253_counter[sel].last_w = 1; + } else { + out_data = vei->vei.vei_data; + i8253_counter[sel].ilatch |= out_data; + i8253_counter[sel].start = + i8253_counter[sel].ilatch; + i8253_counter[sel].last_w = 0; + } + } else { + if (i8253_counter[sel].last_r == 0) { + data = i8253_counter[sel].olatch >> 8; + vei->vei.vei_data = data; + i8253_counter[sel].last_w = 1; + } else { + data = i8253_counter[sel].olatch & 0xFF; + vei->vei.vei_data = data; + i8253_counter[sel].last_w = 0; + } + } + } +} + +/* + * vcpu_process_com_data + * + * Emulate in/out instructions to the com1 (ns8250) UART data register + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_data(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * The guest wrote to the data register. Since we are emulating a + * no-fifo chip, write the character immediately to the pty and + * assert TXRDY in IIR (if the guest has requested TXRDY interrupt + * reporting) + */ + if (vei->vei.vei_dir == 0) { + write(con_fd, &vei->vei.vei_data, 1); + if (com1_regs.ier & 0x2) { + /* Set TXRDY */ + com1_regs.iir |= IIR_TXRDY; + /* Set "interrupt pending" (IIR low bit cleared) */ + com1_regs.iir &= ~0x1; + } + } else { + /* + * vei_dir == 1 : in instruction + * + * The guest read from the data register. Check to see if + * there is data available (RXRDY) and if so, consume the + * input data and return to the guest. Also clear the + * interrupt info register regardless. 
+ */ + if (com1_regs.lsr & LSR_RXRDY) { + vei->vei.vei_data = com1_regs.data; + com1_regs.data = 0x0; + com1_regs.lsr &= ~LSR_RXRDY; + } else { + /* XXX should this be com1_regs.data or 0xff? */ + vei->vei.vei_data = com1_regs.data; + log_warnx("guest reading com1 when not ready"); + } + + /* Reading the data register always clears RXRDY from IIR */ + com1_regs.iir &= ~IIR_RXRDY; + + /* + * Clear "interrupt pending" by setting IIR low bit to 1 + * if no interrupts are pending + */ + if (com1_regs.iir == 0x0) + com1_regs.iir = 0x1; + } +} + +/* + * vcpu_process_com_lcr + * + * Emulate in/out instructions to the com1 (ns8250) UART line control register + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_lcr(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write content to line control register + */ + if (vei->vei.vei_dir == 0) { + com1_regs.lcr = (uint8_t)vei->vei.vei_data; + } else { + /* + * vei_dir == 1 : in instruction + * + * Read line control register + */ + vei->vei.vei_data = com1_regs.lcr; + } +} + +/* + * vcpu_process_com_iir + * + * Emulate in/out instructions to the com1 (ns8250) UART interrupt information + * register. Note that writes to this register actually are to a different + * register, the FCR (FIFO control register) that we don't emulate but still + * consume the data provided. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_iir(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to FCR + */ + if (vei->vei.vei_dir == 0) { + com1_regs.fcr = vei->vei.vei_data; + } else { + /* + * vei_dir == 1 : in instruction + * + * Read IIR. Reading the IIR resets the TXRDY bit in the IIR + * after the data is read. 
+ */ + vei->vei.vei_data = com1_regs.iir; + com1_regs.iir &= ~IIR_TXRDY; + + /* + * Clear "interrupt pending" by setting IIR low bit to 1 + * if no interrupts are pending + */ + if (com1_regs.iir == 0x0) + com1_regs.iir = 0x1; + } +} + +/* + * vcpu_process_com_mcr + * + * Emulate in/out instructions to the com1 (ns8250) UART modem control + * register. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_mcr(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to MCR + */ + if (vei->vei.vei_dir == 0) { + com1_regs.mcr = vei->vei.vei_data; + } else { + /* + * vei_dir == 1 : in instruction + * + * Read from MCR + */ + vei->vei.vei_data = com1_regs.mcr; + } +} + +/* + * vcpu_process_com_lsr + * + * Emulate in/out instructions to the com1 (ns8250) UART line status register. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_lsr(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to LSR. This is an illegal operation, so we just log it and + * continue. + */ + if (vei->vei.vei_dir == 0) { + log_warnx("%s: LSR UART write 0x%x unsupported", + __progname, vei->vei.vei_data); + } else { + /* + * vei_dir == 1 : in instruction + * + * Read from LSR. We always report TXRDY and TSRE since we + * can process output characters immediately (at any time). + */ + vei->vei.vei_data = com1_regs.lsr | LSR_TSRE | LSR_TXRDY; + } +} + +/* + * vcpu_process_com_msr + * + * Emulate in/out instructions to the com1 (ns8250) UART modem status register. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_msr(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to MSR. 
This is an illegal operation, so we just log it and + * continue. + */ + if (vei->vei.vei_dir == 0) { + log_warnx("%s: MSR UART write 0x%x unsupported", + __progname, vei->vei.vei_data); + } else { + /* + * vei_dir == 1 : in instruction + * + * Read from MSR. We always report DCD, DSR, and CTS. + */ + vei->vei.vei_data = com1_regs.lsr | MSR_DCD | MSR_DSR | MSR_CTS; + } +} + +/* + * vcpu_process_com_scr + * + * Emulate in/out instructions to the com1 (ns8250) UART scratch register. The + * scratch register is sometimes used to distinguish an 8250 from a 16450, + * and/or used to distinguish submodels of the 8250 (eg 8250A, 8250B). We + * simulate an "original" 8250 by forcing the scratch register to return data + * on read that is different from what was written. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_scr(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to SCR + */ + if (vei->vei.vei_dir == 0) { + com1_regs.scr = vei->vei.vei_data; + } else { + /* + * vei_dir == 1 : in instruction + * + * Read from SCR. To make sure we don't accidentally simulate + * a real scratch register, we negate what was written on + * subsequent readback. + */ + vei->vei.vei_data = ~com1_regs.scr; + } +} + +/* + * vcpu_process_com_ier + * + * Emulate in/out instructions to the com1 (ns8250) UART interrupt enable + * register. + * + * Parameters: + * vei: vm exit information from vmm(4) containing information on the in/out + * instruction being performed + */ +void +vcpu_process_com_ier(union vm_exit *vei) +{ + /* + * vei_dir == 0 : out instruction + * + * Write to IER + */ + if (vei->vei.vei_dir == 0) { + com1_regs.ier = vei->vei.vei_data; + } else { + /* + * vei_dir == 1 : in instruction + * + * Read from IER + */ + vei->vei.vei_data = com1_regs.ier; + } +} + +/* + * vcpu_exit_com + * + * Process com1 (ns8250) UART exits. 
vmd handles most basic 8250 + * features with the exception of the divisor latch (eg, no baud + * rate support) + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + */ +void +vcpu_exit_com(struct vm_run_params *vrp) +{ + union vm_exit *vei = vrp->vrp_exit; + + switch(vei->vei.vei_port) { + case COM1_LCR: + vcpu_process_com_lcr(vei); + break; + case COM1_IER: + vcpu_process_com_ier(vei); + break; + case COM1_IIR: + vcpu_process_com_iir(vei); + break; + case COM1_MCR: + vcpu_process_com_mcr(vei); + break; + case COM1_LSR: + vcpu_process_com_lsr(vei); + break; + case COM1_MSR: + vcpu_process_com_msr(vei); + break; + case COM1_SCR: + vcpu_process_com_scr(vei); + break; + case COM1_DATA: + vcpu_process_com_data(vei); + break; + } +} + +/* + * vcpu_exit_pci + * + * Handle all I/O to the emulated PCI subsystem. + * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return values: + * 0xff if no interrupt is required after this pci exit, + * or an interrupt vector otherwise + */ +uint8_t +vcpu_exit_pci(struct vm_run_params *vrp) +{ + union vm_exit *vei = vrp->vrp_exit; + uint8_t intr; + + intr = 0xFF; + + switch(vei->vei.vei_port) { + case PCI_MODE1_ADDRESS_REG: + pci_handle_address_reg(vrp); + break; + case PCI_MODE1_DATA_REG: + pci_handle_data_reg(vrp); + break; + case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: + intr = pci_handle_io(vrp); + break; + default: + log_warnx("%s: unknown PCI register 0x%llx", + __progname, (uint64_t)vei->vei.vei_port); + break; + } + + return (intr); +} + +/* + * vcpu_exit_inout + * + * Handle all I/O exits that need to be emulated in vmd. This includes the + * i8253 PIT and the com1 ns8250 UART. 
+ * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + */ +void +vcpu_exit_inout(struct vm_run_params *vrp) +{ + union vm_exit *vei = vrp->vrp_exit; + uint8_t intr; + + switch(vei->vei.vei_port) { + case TIMER_CTRL: + case (TIMER_CNTR0 + TIMER_BASE): + case (TIMER_CNTR1 + TIMER_BASE): + case (TIMER_CNTR2 + TIMER_BASE): + vcpu_exit_i8253(vei); + break; + case COM1_DATA ... COM1_SCR: + vcpu_exit_com(vrp); + break; + case PCI_MODE1_ADDRESS_REG: + case PCI_MODE1_DATA_REG: + case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END: + intr = vcpu_exit_pci(vrp); + if (intr != 0xFF) + vrp->vrp_injint = intr; + else + vrp->vrp_injint = -1; + break; + default: + /* IN from unsupported port gives FFs */ + if (vei->vei.vei_dir == 1) + vei->vei.vei_data = 0xFFFFFFFF; + break; + } +} + +/* + * vcpu_exit + * + * Handle a vcpu exit. This function is called when it is determined that + * vmm(4) requires the assistance of vmd to support a particular guest + * exit type (eg, accessing an I/O port or device). Guest state is contained + * in 'vrp', and will be resent to vmm(4) on exit completion. + * + * Upon conclusion of handling the exit, the function determines if any + * interrupts should be injected into the guest, and sets vrp->vrp_injint + * to the IRQ line whose interrupt should be vectored (or -1 if no interrupt + * is to be injected). 
+ * + * Parameters: + * vrp: vcpu run parameters containing guest state for this exit + * + * Return values: + * 0: the exit was handled successfully + * 1: an error occurred (exit not handled) + */ +int +vcpu_exit(struct vm_run_params *vrp) +{ + ssize_t sz; + char ch; + + switch (vrp->vrp_exit_reason) { + case VMX_EXIT_IO: + vcpu_exit_inout(vrp); + break; + case VMX_EXIT_HLT: + /* + * XXX handle halted state, no reason to run this vcpu again + * until a vm interrupt is to be injected + */ + break; + default: + log_warnx("%s: unknown exit reason %d", + __progname, vrp->vrp_exit_reason); + return (1); + } + + /* XXX interrupt priority */ + if (vionet_process_rx()) + vrp->vrp_injint = 9; + + /* + * Is there a new character available on com1? + * If so, consume the character, buffer it into the com1 data register + * assert IRQ4, and set the line status register RXRDY bit. + * + * XXX - move all this com intr checking to another function + */ + sz = read(con_fd, &ch, sizeof(char)); + if (sz == 1) { + com1_regs.lsr |= LSR_RXRDY; + com1_regs.data = ch; + /* XXX these ier and iir bits should be IER_x and IIR_x */ + if (com1_regs.ier & 0x1) { + com1_regs.iir |= (2 << 1); + com1_regs.iir &= ~0x1; + } + } + + /* + * Clear "interrupt pending" by setting IIR low bit to 1 if no + * interrupts are pending + */ + /* XXX these iir magic numbers should be IIR_x */ + if ((com1_regs.iir & ~0x1) == 0x0) + com1_regs.iir = 0x1; + + /* If pending interrupt and nothing waiting to be injected, inject */ + if ((com1_regs.iir & 0x1) == 0) + if (vrp->vrp_injint == -1) + vrp->vrp_injint = 0x4; + vrp->vrp_continue = 1; + + return (0); +} + +/* + * write_page + * + * Pushes a page of data from 'buf' into the guest VM's memory + * at paddr 'dst'. + * + * Parameters: + * dst: the destination paddr_t in the guest VM to push into. 
+ * If there is no guest paddr mapping at 'dst', a new page will be + * faulted in by the VMM (provided 'dst' represents a valid paddr + * in the guest's address space) + * buf: page of data to push + * len: size of 'buf' + * do_mask: 1 to mask the destination address (for kernel load), 0 to + * leave 'dst' unmasked + * + * Return values: + * various return values from ioctl(VMM_IOC_WRITEPAGE), or 0 if no error + * occurred. + * + * Note - this function only handles GPAs < 4GB. + */ +int +write_page(uint32_t dst, void *buf, uint32_t len, int do_mask) +{ + struct vm_writepage_params vwp; + + /* + * Mask kernel load addresses to avoid uint32_t -> uint64_t cast + * errors + */ + if (do_mask) + dst &= 0xFFFFFFF; + + vwp.vwp_paddr = (paddr_t)dst; + vwp.vwp_data = buf; + vwp.vwp_vm_id = vm_id; + vwp.vwp_len = len; + if (ioctl(env->vmd_fd, VMM_IOC_WRITEPAGE, &vwp) < 0) { + log_warn("writepage ioctl failed"); + return (errno); + } + return (0); +} + +/* + * read_page + * + * Reads a page of memory at guest paddr 'src' into 'buf'. + * + * Parameters: + * src: the source paddr_t in the guest VM to read from. + * buf: destination (local) buffer + * len: size of 'buf' + * do_mask: 1 to mask the source address (for kernel load), 0 to + * leave 'src' unmasked + * + * Return values: + * various return values from ioctl(VMM_IOC_READPAGE), or 0 if no error + * occurred. + * + * Note - this function only handles GPAs < 4GB. + */ +int +read_page(uint32_t src, void *buf, uint32_t len, int do_mask) +{ + struct vm_readpage_params vrp; + + /* + * Mask kernel load addresses to avoid uint32_t -> uint64_t cast + * errors + */ + if (do_mask) + src &= 0xFFFFFFF; + + vrp.vrp_paddr = (paddr_t)src; + vrp.vrp_data = buf; + vrp.vrp_vm_id = vm_id; + vrp.vrp_len = len; + if (ioctl(env->vmd_fd, VMM_IOC_READPAGE, &vrp) < 0) { + log_warn("readpage ioctl failed"); + return (errno); + } + return (0); +} |