#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <dev/vmm/vmm.h>
#include <net/if.h>
#include <errno.h>
#include <event.h>
#include <fcntl.h>
#include <imsg.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "vmd.h"
#include "atomicio.h"
#include "proc.h"
/* Forward declarations for functions defined later in this file. */
void vmm_sighdlr(int, short, void *);
int vmm_start_vm(struct imsg *, uint32_t *, pid_t *);
int vmm_dispatch_parent(int, struct privsep_proc *, struct imsg *);
void vmm_run(struct privsep *, struct privsep_proc *, void *);
void vmm_dispatch_vm(int, short, void *);
int terminate_vm(struct vm_terminate_params *);
int get_info_vm(struct privsep *, struct imsg *, int);
int opentap(char *);
/*
 * Descriptor for /dev/null.  It stays -1 until vmm_run() opens it, which
 * only happens when env->vmd_debug is not set.
 */
int dev_null = -1;
/* Global daemon state; defined outside this file. */
extern struct vmd *env;
/*
 * Process table handed to proc_run() by vmm().  Its single entry routes
 * PROC_PARENT to vmm_dispatch_parent().
 */
static struct privsep_proc procs[] = {
{ "parent", PROC_PARENT, vmm_dispatch_parent },
};
void
vmm(struct privsep *ps, struct privsep_proc *p)
{
proc_run(ps, p, procs, nitems(procs), vmm_run, NULL);
}
/*
 * vmm_run
 *
 * Callback handed to proc_run() by vmm().  Performs the one-time setup of
 * this process; every failure here is fatal.
 *
 * Steps, in order:
 *  - initialize the configuration of ps->ps_env
 *  - unless env->vmd_debug is set, open /dev/null into dev_null
 *  - unveil(2) env->argv0 for execution only and lock the unveil list
 *  - pledge(2) to "stdio vmm sendfd recvfd proc exec"
 *  - re-register the SIGCHLD event so that it calls vmm_sighdlr()
 *
 * Parameters:
 *  ps: privsep state
 *  p: not used by this function
 *  arg: not used by this function
 */
void
vmm_run(struct privsep *ps, struct privsep_proc *p, void *arg)
{
	if (config_init(ps->ps_env) == -1)
		fatal("failed to initialize configuration");

	/* dev_null keeps its initial value when running in debug mode. */
	if (!env->vmd_debug &&
	    (dev_null = open("/dev/null", O_RDWR|O_CLOEXEC, 0)) == -1)
		fatal("/dev/null");

	/* Only our own binary stays visible, and only for execution. */
	if (unveil(env->argv0, "x") == -1)
		fatal("unveil %s", env->argv0);
	if (unveil(NULL, NULL) == -1)
		fatal("unveil lock");

	if (pledge("stdio vmm sendfd recvfd proc exec", NULL) == -1)
		fatal("pledge");

	/* Swap whatever SIGCHLD handler was installed for our own. */
	signal_del(&ps->ps_evsigchld);
	signal_set(&ps->ps_evsigchld, SIGCHLD, vmm_sighdlr, ps);
	signal_add(&ps->ps_evsigchld, NULL);
}
/*
 * vmm_dispatch_parent
 *
 * imsg handler registered for PROC_PARENT in the procs table.  The first
 * switch acts on the message type and may select a response command (cmd)
 * and a result code (res); the second switch sends that response, if any,
 * back to PROC_PARENT.
 *
 * Parameters:
 *  fd: not used by this function
 *  p: process descriptor; p->p_ps supplies the privsep state
 *  imsg: the received message
 *
 * Return values:
 *  0: the message was handled
 *  -1: the message type is not handled here, or composing the response
 *      failed
 */
int
vmm_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
{
struct privsep *ps = p->p_ps;
int res = 0, cmd = IMSG_NONE, verbose;
struct vmd_vm *vm = NULL;
struct vm_terminate_params vtp;
struct vmop_id vid;
struct vmop_result vmr;
struct vmop_addr_result var;
uint32_t id = 0, vm_id, type;
pid_t pid, vm_pid = 0;
unsigned int mode, flags;
pid = imsg_get_pid(imsg);
type = imsg_get_type(imsg);
vm_id = imsg_get_id(imsg);
switch (type) {
/*
 * The START_VM_{REQUEST,CDROM,DISK,IF} messages each hand the imsg to the
 * matching config_get*() helper.  A response is sent only when the helper
 * fails, in which case errno is reported through START_VM_RESPONSE.
 */
case IMSG_VMDOP_START_VM_REQUEST:
res = config_getvm(ps, imsg);
if (res == -1) {
res = errno;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
}
break;
case IMSG_VMDOP_START_VM_CDROM:
res = config_getcdrom(ps, imsg);
if (res == -1) {
res = errno;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
}
break;
case IMSG_VMDOP_START_VM_DISK:
res = config_getdisk(ps, imsg);
if (res == -1) {
res = errno;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
}
break;
case IMSG_VMDOP_START_VM_IF:
res = config_getif(ps, imsg);
if (res == -1) {
res = errno;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
}
break;
case IMSG_VMDOP_START_VM_END:
/*
 * Start the VM.  On success the id stored by vmm_start_vm() is mapped
 * through vm_id2vmid(); a mapping of 0 is reported as ENOENT.  A response
 * is always sent for this message.
 */
res = vmm_start_vm(imsg, &id, &vm_pid);
if (res == 0 && (id = vm_id2vmid(id, NULL)) == 0)
res = ENOENT;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
break;
case IMSG_VMDOP_TERMINATE_VM_REQUEST:
vmop_id_read(imsg, &vid);
id = vid.vid_id;
flags = vid.vid_flags;
DPRINTF("%s: recv'ed TERMINATE_VM for %d", __func__, id);
cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
if (id == 0) {
res = ENOENT;
} else if ((vm = vm_getbyvmid(id)) != NULL) {
if (flags & VMOP_FORCE) {
/*
 * Forced stop: flag the VM as shutting down and terminate it right away.
 * The result of terminate_vm() is ignored and success is reported.
 */
vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
vm->vm_state |= VM_STATE_SHUTDOWN;
(void)terminate_vm(&vtp);
res = 0;
} else if (!(vm->vm_state & VM_STATE_SHUTDOWN)) {
/*
 * Not yet flagged for shutdown: flag it and notify the VM's process.
 * Note that the message type sent is IMSG_VMDOP_VM_REBOOT.
 */
log_debug("%s: sending shutdown request"
" to vm %d", __func__, id);
vm->vm_state |= VM_STATE_SHUTDOWN;
if (imsg_compose_event(&vm->vm_iev,
IMSG_VMDOP_VM_REBOOT,
0, 0, -1, NULL, 0) == -1)
res = errno;
else
res = 0;
} else {
/*
 * Already flagged for shutdown.  Report VMD_VM_STOP_INVALID only if
 * vm_vmid2id() yields 0 for it; otherwise res keeps its initial 0.
 */
if (vm_vmid2id(vm->vm_vmid, vm) == 0) {
log_debug("%s: no vm running anymore",
__func__);
res = VMD_VM_STOP_INVALID;
}
}
} else {
log_debug("%s: cannot stop vm that is not running",
__func__);
res = VMD_VM_STOP_INVALID;
}
break;
case IMSG_VMDOP_GET_INFO_VM_REQUEST:
/*
 * get_info_vm() sends one GET_INFO_VM_DATA message per VM; its result is
 * then sent as GET_INFO_VM_END_DATA by the second switch below.
 */
res = get_info_vm(ps, imsg, 0);
cmd = IMSG_VMDOP_GET_INFO_VM_END_DATA;
break;
case IMSG_VMDOP_CONFIG:
config_getconfig(env, imsg);
break;
case IMSG_CTL_RESET:
/*
 * When CONFIG_VMS is requested, the VMs are shut down here and that bit
 * is cleared before the remaining mode bits go to config_purge().
 */
mode = imsg_uint_read(imsg);
if (mode & CONFIG_VMS) {
vmm_shutdown();
mode &= ~CONFIG_VMS;
}
config_purge(env, mode);
break;
case IMSG_CTL_VERBOSE:
/* Apply the verbosity locally, then pass it on to every VM's process. */
verbose = imsg_int_read(imsg);
log_setverbose(verbose);
env->vmd_verbose = verbose;
TAILQ_FOREACH(vm, env->vmd_vms, vm_entry) {
imsg_compose_event(&vm->vm_iev, type, -1, pid, -1,
&verbose, sizeof(verbose));
}
break;
case IMSG_VMDOP_PAUSE_VM:
/*
 * Relay the request, with any fd attached to the imsg, to the VM's
 * process.  A response is sent from here only when the VM is unknown.
 */
vmop_id_read(imsg, &vid);
id = vid.vid_id;
if ((vm = vm_getbyvmid(id)) == NULL) {
res = ENOENT;
cmd = IMSG_VMDOP_PAUSE_VM_RESPONSE;
break;
}
imsg_compose_event(&vm->vm_iev, type, -1, pid,
imsg_get_fd(imsg), &vid, sizeof(vid));
break;
case IMSG_VMDOP_UNPAUSE_VM:
/* Same relay scheme as IMSG_VMDOP_PAUSE_VM. */
vmop_id_read(imsg, &vid);
id = vid.vid_id;
if ((vm = vm_getbyvmid(id)) == NULL) {
res = ENOENT;
cmd = IMSG_VMDOP_UNPAUSE_VM_RESPONSE;
break;
}
imsg_compose_event(&vm->vm_iev, type, -1, pid,
imsg_get_fd(imsg), &vid, sizeof(vid));
break;
case IMSG_VMDOP_PRIV_GET_ADDR_RESPONSE:
/*
 * Relay the address result to the VM's process.  If the VM is unknown,
 * res is set but no response command is selected, so nothing is sent back.
 */
vmop_addr_result_read(imsg, &var);
if ((vm = vm_getbyvmid(var.var_vmid)) == NULL) {
res = ENOENT;
break;
}
imsg_compose_event(&vm->vm_iev, type, -1, pid,
imsg_get_fd(imsg), &var, sizeof(var));
break;
case IMSG_VMDOP_RECEIVE_VMM_FD:
/*
 * Accept the vmm fd exactly once, then call get_info_vm() in terminate
 * mode, which terminates every VM the VMM_IOC_INFO ioctl reports.
 */
if (env->vmd_fd > -1)
fatalx("already received vmm fd");
env->vmd_fd = imsg_get_fd(imsg);
get_info_vm(ps, NULL, 1);
break;
case IMSG_VMDOP_RECEIVE_PSP_FD:
/* Accept the psp fd exactly once. */
if (env->vmd_psp_fd > -1)
fatalx("already received psp fd");
env->vmd_psp_fd = imsg_get_fd(imsg);
break;
default:
return (-1);
}
/* Second stage: send the response selected above, if any. */
switch (cmd) {
case 0:
/* No response selected (assumes IMSG_NONE is 0 -- TODO confirm). */
break;
case IMSG_VMDOP_START_VM_RESPONSE:
/* A failed start drops the VM registered under the imsg's id, if any. */
if (res != 0) {
if ((vm = vm_getbyvmid(vm_id)) != NULL) {
log_debug("%s: removing vm, START_VM_RESPONSE",
__func__);
vm_remove(vm, __func__);
}
}
if (id == 0)
id = vm_id;
/* FALLTHROUGH: the start response is sent by the shared code below. */
case IMSG_VMDOP_PAUSE_VM_RESPONSE:
case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
/* These responses carry a struct vmop_result (result, id and pid). */
memset(&vmr, 0, sizeof(vmr));
vmr.vmr_result = res;
vmr.vmr_id = id;
vmr.vmr_pid = vm_pid;
if (proc_compose_imsg(ps, PROC_PARENT, cmd, vm_id, -1, &vmr,
sizeof(vmr)) == -1)
return (-1);
break;
default:
/* Any other response carries only the integer result. */
if (proc_compose_imsg(ps, PROC_PARENT, cmd, vm_id, -1, &res,
sizeof(res)) == -1)
return (-1);
break;
}
return (0);
}
/*
 * vmm_sighdlr
 *
 * Signal event callback installed for SIGCHLD by vmm_run().  Reaps every
 * exited child without blocking.  For each child that belongs to a known
 * VM it terminates the VM, reports IMSG_VMDOP_TERMINATE_VM_EVENT to
 * PROC_PARENT and removes the VM.
 *
 * Parameters:
 *  sig: the signal being handled; anything but SIGCHLD is fatal
 *  event: not used by this function
 *  arg: the struct privsep registered together with the handler
 */
void
vmm_sighdlr(int sig, short event, void *arg)
{
	struct privsep			*ps = arg;
	int				 status, ret;
	pid_t				 pid;
	struct vmop_result		 vmr;
	struct vmd_vm			*vm;
	struct vm_terminate_params	 vtp;

	log_debug("%s: handling signal %d", __func__, sig);
	switch (sig) {
	case SIGCHLD:
		do {
			pid = waitpid(-1, &status, WNOHANG);
			if (pid <= 0)
				continue;

			if (WIFEXITED(status) || WIFSIGNALED(status)) {
				vm = vm_getbypid(pid);
				if (vm == NULL) {
					/* No VM is registered for this pid. */
					continue;
				}

				/*
				 * Start from a clean result for every child.
				 * A child killed by a signal has no exit
				 * status, so it must not inherit the status
				 * of a child reaped earlier in this loop.
				 */
				ret = 0;
				if (WIFEXITED(status))
					ret = WEXITSTATUS(status);

				/*
				 * An EAGAIN exit status from a VM that is
				 * flagged for shutdown is reported as 0.
				 * NOTE(review): EAGAIN presumably requests a
				 * restart from the parent -- confirm.
				 */
				if (ret == EAGAIN &&
				    (vm->vm_state & VM_STATE_SHUTDOWN))
					ret = 0;

				vtp.vtp_vm_id = vm->vm_vmmid;
				if (terminate_vm(&vtp) == 0)
					log_debug("%s: terminated vm %s"
					    " (id %d)", __func__,
					    vm->vm_params.vmc_name,
					    vm->vm_vmid);

				memset(&vmr, 0, sizeof(vmr));
				vmr.vmr_result = ret;
				vmr.vmr_id = vm_id2vmid(vm->vm_vmmid, vm);
				if (proc_compose_imsg(ps, PROC_PARENT,
				    IMSG_VMDOP_TERMINATE_VM_EVENT,
				    vm->vm_peerid, -1, &vmr, sizeof(vmr)) == -1)
					log_warnx("could not signal "
					    "termination of VM %u to "
					    "parent", vm->vm_vmid);

				vm_remove(vm, __func__);
			} else
				fatalx("unexpected cause of SIGCHLD");
		} while (pid > 0 || (pid == -1 && errno == EINTR));
		break;
	default:
		fatalx("unexpected signal");
	}
}
void
vmm_shutdown(void)
{
struct vm_terminate_params vtp;
struct vmd_vm *vm, *vm_next;
TAILQ_FOREACH_SAFE(vm, env->vmd_vms, vm_entry, vm_next) {
vtp.vtp_vm_id = vm_vmid2id(vm->vm_vmid, vm);
(void)terminate_vm(&vtp);
vm_remove(vm, __func__);
}
}
/*
 * vmm_pipe
 *
 * Attach descriptor fd to the VM's imsg event (vm->vm_iev): mark it
 * close-on-exec, initialize the imsg buffer on it with fd passing allowed,
 * install cb as the handler with the VM as its data, and add the event.
 *
 * Parameters:
 *  vm: the VM whose vm_iev is set up
 *  fd: descriptor of the channel to the VM's process
 *  cb: event callback to install
 *
 * Return values:
 *  0: success
 *  -1: setting close-on-exec or initializing the imsg buffer failed
 */
int
vmm_pipe(struct vmd_vm *vm, int fd, void (*cb)(int, short, void *))
{
	struct imsgev	*chan = &vm->vm_iev;

	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1) {
		log_warn("failed to set close-on-exec for vmm ipc channel");
		return (-1);
	}

	if (imsgbuf_init(&chan->ibuf, fd) == -1) {
		log_warn("failed to init imsgbuf");
		return (-1);
	}
	imsgbuf_allow_fdpass(&chan->ibuf);

	chan->data = vm;
	chan->handler = cb;
	imsg_event_add(chan);

	return (0);
}
/*
 * vmm_dispatch_vm
 *
 * Event callback for the imsg channel to a single VM's process (installed
 * through vmm_pipe() by vmm_start_vm()).  Performs the pending read and/or
 * write on the channel, then handles every complete message queued on it.
 *
 * Parameters:
 *  fd: not used by this function
 *  event: EV_READ and/or EV_WRITE
 *  arg: the struct vmd_vm the channel belongs to
 */
void
vmm_dispatch_vm(int fd, short event, void *arg)
{
	struct vmd_vm	*vm = arg;
	struct imsgev	*iev = &vm->vm_iev;
	struct imsgbuf	*ibuf = &iev->ibuf;
	struct imsg	 imsg;
	ssize_t		 n;
	unsigned int	 idx;
	uint32_t	 type;

	if (event & EV_READ) {
		n = imsgbuf_read(ibuf);
		if (n == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* Nothing more to read: stop watching the channel. */
			event_del(&iev->ev);
			return;
		}
	}

	if ((event & EV_WRITE) && imsgbuf_write(ibuf) == -1) {
		if (errno != EPIPE)
			fatal("%s: imsgbuf_write fd %d", __func__, ibuf->fd);
		/* EPIPE: stop watching the channel. */
		event_del(&iev->ev);
		return;
	}

	while ((n = imsg_get(ibuf, &imsg)) != 0) {
		if (n == -1)
			fatal("%s: imsg_get", __func__);

		type = imsg_get_type(&imsg);
		switch (type) {
		case IMSG_VMDOP_VM_SHUTDOWN:
			vm->vm_state |= VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_VM_REBOOT:
			vm->vm_state &= ~VM_STATE_SHUTDOWN;
			break;
		case IMSG_VMDOP_PAUSE_VM_RESPONSE:
		case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
			/* Forward via the first PROC_PARENT entry, if any. */
			idx = 0;
			while (idx < nitems(procs) &&
			    procs[idx].p_id != PROC_PARENT)
				idx++;
			if (idx < nitems(procs))
				proc_forward_imsg(procs[idx].p_ps,
				    &imsg, PROC_PARENT, -1);
			break;
		default:
			fatalx("%s: got invalid imsg %d from %s", __func__,
			    type, vm->vm_params.vmc_name);
		}
		imsg_free(&imsg);
	}

	imsg_event_add(iev);
}
/*
 * terminate_vm
 *
 * Issue the VMM_IOC_TERM ioctl on env->vmd_fd for the VM described by vtp.
 *
 * Parameters:
 *  vtp: terminate parameters handed to the ioctl
 *
 * Return values:
 *  0: the ioctl succeeded
 *  !0: errno as left by the failed ioctl
 */
int
terminate_vm(struct vm_terminate_params *vtp)
{
	return (ioctl(env->vmd_fd, VMM_IOC_TERM, vtp) == -1 ? errno : 0);
}
/*
 * opentap
 *
 * Try /dev/tap0, /dev/tap1, ... up to MAX_TAP devices and open the first
 * one that can be opened read/write and non-blocking.  A device that fails
 * with EBUSY is skipped; any other failure ends the search.
 *
 * Parameters:
 *  ifname: buffer of at least IF_NAMESIZE bytes; receives "tap<N>" where N
 *      is the index at which the search stopped, whether or not the open
 *      succeeded
 *
 * Return values:
 *  the descriptor of the opened device, or -1 if none could be opened.
 *  If any open attempt failed, errno is set to the last failure seen,
 *  even when a later device was opened successfully.
 */
int
opentap(char *ifname)
{
	char	 devpath[PATH_MAX];
	int	 unit, tapfd, last_err = 0;

	for (unit = 0; unit < MAX_TAP; unit++) {
		snprintf(devpath, PATH_MAX, "/dev/tap%d", unit);

		errno = 0;
		tapfd = open(devpath, O_RDWR | O_NONBLOCK);
		if (tapfd != -1)
			break;

		last_err = errno;
		if (last_err == EBUSY)
			continue;	/* in use, try the next unit */

		/* ENOENT ends the search quietly; anything else is logged. */
		if (last_err != ENOENT)
			log_warn("%s: unexpected error", __func__);
		break;
	}

	snprintf(ifname, IF_NAMESIZE, "tap%d", unit);

	if (last_err != 0)
		errno = last_err;
	return (tapfd);
}
int
vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
{
struct vmd_vm *vm;
char *nargv[10], num[32], vmm_fd[32], psp_fd[32];
int ret = EINVAL;
int fds[2];
pid_t vm_pid;
size_t i, j, sz;
uint32_t peer_id;
peer_id = imsg_get_id(imsg);
if ((vm = vm_getbyvmid(peer_id)) == NULL) {
log_warnx("%s: can't find vm", __func__);
return (ENOENT);
}
if ((vm->vm_tty = imsg_get_fd(imsg)) == -1) {
log_warnx("%s: can't get tty", __func__);
goto err;
}
if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, PF_UNSPEC, fds)
== -1)
fatal("socketpair");
vm_pid = fork();
if (vm_pid == -1) {
log_warn("%s: start child failed", __func__);
ret = EIO;
goto err;
}
if (vm_pid > 0) {
vm->vm_pid = vm_pid;
close_fd(fds[1]);
sz = atomicio(vwrite, fds[0], vm, sizeof(*vm));
if (sz != sizeof(*vm)) {
log_warnx("%s: failed to send config for vm '%s'",
__func__, vm->vm_params.vmc_name);
ret = EIO;
}
for (i = 0 ; i < vm->vm_params.vmc_ndisks; i++) {
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
if (close_fd(vm->vm_disks[i][j]) == 0)
vm->vm_disks[i][j] = -1;
}
}
for (i = 0 ; i < vm->vm_params.vmc_nnics; i++) {
if (close_fd(vm->vm_ifs[i].vif_fd) == 0)
vm->vm_ifs[i].vif_fd = -1;
}
if (close_fd(vm->vm_kernel) == 0)
vm->vm_kernel = -1;
if (close_fd(vm->vm_cdrom) == 0)
vm->vm_cdrom = -1;
if (close_fd(vm->vm_tty) == 0)
vm->vm_tty = -1;
if (ret == EIO)
goto err;
sz = atomicio(vwrite, fds[0], &env->vmd_cfg.cfg_localprefix,
sizeof(env->vmd_cfg.cfg_localprefix));
if (sz != sizeof(env->vmd_cfg.cfg_localprefix)) {
log_warnx("%s: failed to send local prefix for vm '%s'",
__func__, vm->vm_params.vmc_name);
ret = EIO;
goto err;
}
sz = atomicio(read, fds[0], &vm->vm_vmmid,
sizeof(vm->vm_vmmid));
if (sz != sizeof(vm->vm_vmmid)) {
log_debug("%s: failed to receive vm id from vm %s",
__func__, vm->vm_params.vmc_name);
ret = ENOMEM;
goto err;
}
if (vm->vm_vmmid == 0)
goto err;
*id = vm->vm_vmmid;
*pid = vm->vm_pid;
if (vmm_pipe(vm, fds[0], vmm_dispatch_vm) == -1)
fatal("setup vm pipe");
} else {
if (setsid() == -1)
fatal("setsid");
close_fd(fds[0]);
close_fd(PROC_PARENT_SOCK_FILENO);
if (!env->vmd_debug) {
dup2(dev_null, STDIN_FILENO);
dup2(dev_null, STDOUT_FILENO);
dup2(dev_null, STDERR_FILENO);
if (dev_null > 2)
close(dev_null);
}
if (env->vmd_psp_fd > 0)
fcntl(env->vmd_psp_fd, F_SETFD, 0);
memset(num, 0, sizeof(num));
snprintf(num, sizeof(num), "%d", fds[1]);
memset(vmm_fd, 0, sizeof(vmm_fd));
snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
memset(psp_fd, 0, sizeof(psp_fd));
snprintf(psp_fd, sizeof(psp_fd), "%d", env->vmd_psp_fd);
i = 0;
nargv[i++] = env->argv0;
nargv[i++] = "-V";
nargv[i++] = num;
nargv[i++] = "-i";
nargv[i++] = vmm_fd;
nargv[i++] = "-j";
nargv[i++] = psp_fd;
if (env->vmd_debug)
nargv[i++] = "-d";
if (env->vmd_verbose == 1)
nargv[i++] = "-v";
else if (env->vmd_verbose > 1)
nargv[i++] = "-vv";
nargv[i++] = NULL;
if (i > sizeof(nargv) / sizeof(nargv[0]))
fatalx("%s: nargv overflow", __func__);
execvp(nargv[0], nargv);
ret = errno;
log_warn("execvp %s", nargv[0]);
_exit(ret);
}
return (0);
err:
if (!vm->vm_from_config)
vm_remove(vm, __func__);
return (ret);
}
int
get_info_vm(struct privsep *ps, struct imsg *imsg, int terminate)
{
int ret;
size_t ct, i;
struct vm_info_params vip;
struct vm_info_result *info;
struct vm_terminate_params vtp;
struct vmop_info_result vir;
uint32_t peer_id;
vip.vip_size = 0;
info = NULL;
ret = 0;
memset(&vir, 0, sizeof(vir));
if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1)
return (errno);
if (vip.vip_info_ct != 0)
return (EIO);
info = malloc(vip.vip_size);
if (info == NULL)
return (ENOMEM);
vip.vip_info = info;
if (ioctl(env->vmd_fd, VMM_IOC_INFO, &vip) == -1) {
ret = errno;
free(info);
return (ret);
}
ct = vip.vip_size / sizeof(struct vm_info_result);
for (i = 0; i < ct; i++) {
if (terminate) {
vtp.vtp_vm_id = info[i].vir_id;
if ((ret = terminate_vm(&vtp)) != 0)
break;
log_debug("%s: terminated vm %s (id %d)", __func__,
info[i].vir_name, info[i].vir_id);
continue;
}
vir.vir_memory_size = info[i].vir_memory_size;
vir.vir_used_size = info[i].vir_used_size;
vir.vir_ncpus = info[i].vir_ncpus;
memcpy(vir.vir_vcpu_state, info[i].vir_vcpu_state,
sizeof(vir.vir_vcpu_state));
vir.vir_creator_pid = info[i].vir_creator_pid;
vir.vir_id = vm_id2vmid(info[i].vir_id, NULL);
memcpy(vir.vir_name, info[i].vir_name, sizeof(vir.vir_name));
peer_id = imsg_get_id(imsg);
if (proc_compose_imsg(ps, PROC_PARENT,
IMSG_VMDOP_GET_INFO_VM_DATA, peer_id, -1,
&vir, sizeof(vir)) == -1) {
ret = EIO;
break;
}
}
free(info);
return (ret);
}