#include <sys/types.h>
#include <sys/queue.h>
#include <sys/time.h>
#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <imsg.h>
#include "proc.h"
#include "vmd.h"
/*
 * NULL-terminated table of interface type names ("bridge", "veb").
 * NOTE(review): presumably the device types vmd accepts as switches --
 * confirm against the code that walks this table.
 */
const char *vmd_descsw[] = { "bridge", "veb", NULL };
static int config_init_localprefix(struct vmd_config *);

/*
 * Initialise the local prefix settings in cfg.
 *
 * The IPv4 prefix is always (re)parsed from VMD_DHCP_PREFIX and the
 * VMD_CFG_INET6 flag is always cleared.  The IPv6 prefix is derived from
 * VMD_ULA_PREFIX with 7 random bytes written at s6_addr[1..7]; this happens
 * only while VMD_CFG_AUTOINET6 is not yet set, and sets the flag afterwards
 * so that later calls keep the prefix generated earlier.
 *
 * Returns 0 on success, -1 if either prefix fails to parse.
 */
static int
config_init_localprefix(struct vmd_config *cfg)
{
	if (parse_prefix4(VMD_DHCP_PREFIX, &cfg->cfg_localprefix, NULL) == -1)
		return (-1);

	cfg->cfg_flags &= ~VMD_CFG_INET6;

	if ((cfg->cfg_flags & VMD_CFG_AUTOINET6) == 0) {
		if (parse_prefix6(VMD_ULA_PREFIX, &cfg->cfg_localprefix,
		    NULL) == -1)
			return (-1);

		/* Randomise the 7 bytes that follow the first prefix byte. */
		arc4random_buf(&cfg->cfg_localprefix.lp_in6.s6_addr[1], 7);
		cfg->cfg_flags |= VMD_CFG_AUTOINET6;
	}

	return (0);
}
/*
 * Set up the per-process configuration state.
 *
 * Records which configuration classes each privsep process handles
 * (parent: CONFIG_ALL, vmm: CONFIG_VMS), initialises the local prefix and
 * allocates the list heads the calling process (privsep_process) needs.
 *
 * Returns 0 on success, -1 if the prefix setup or an allocation fails.
 */
int
config_init(struct vmd *env)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 mask;

	ps->ps_what[PROC_PARENT] = CONFIG_ALL;
	ps->ps_what[PROC_VMM] = CONFIG_VMS;

	if (config_init_localprefix(&env->vmd_cfg) == -1)
		return (-1);

	/* Only allocate the lists this process is responsible for. */
	mask = ps->ps_what[privsep_process];

	if ((mask & CONFIG_VMS) != 0) {
		env->vmd_vms = calloc(1, sizeof(*env->vmd_vms));
		if (env->vmd_vms == NULL)
			return (-1);

		env->vmd_known = calloc(1, sizeof(*env->vmd_known));
		if (env->vmd_known == NULL)
			return (-1);

		TAILQ_INIT(env->vmd_vms);
		TAILQ_INIT(env->vmd_known);
	}

	if ((mask & CONFIG_SWITCHES) != 0) {
		env->vmd_switches = calloc(1, sizeof(*env->vmd_switches));
		if (env->vmd_switches == NULL)
			return (-1);

		TAILQ_INIT(env->vmd_switches);
	}

	return (0);
}
/*
 * Drop configuration state held by the calling process.
 *
 * The local prefix is re-initialised unconditionally (its result is not
 * checked).  Of the classes named in reset, only those this process handles
 * are purged: CONFIG_VMS empties the vm and known-name lists and zeroes
 * vmd_nvm, CONFIG_SWITCHES empties the switch list and zeroes vmd_nswitches.
 */
void
config_purge(struct vmd *env, unsigned int reset)
{
	struct privsep		*ps = &env->vmd_ps;
	struct vmd_switch	*sw;
	struct name2id		*known;
	struct vmd_vm		*vm;
	unsigned int		 mask;

	DPRINTF("%s: %s purging vms and switches",
	    __func__, ps->ps_title[privsep_process]);

	(void)config_init_localprefix(&env->vmd_cfg);

	mask = ps->ps_what[privsep_process] & reset;

	if ((mask & CONFIG_VMS) != 0 && env->vmd_vms != NULL) {
		/*
		 * Keep taking the head until the list is empty.
		 * NOTE(review): this relies on vm_remove() unlinking the vm
		 * from vmd_vms -- confirm.
		 */
		for (vm = TAILQ_FIRST(env->vmd_vms); vm != NULL;
		    vm = TAILQ_FIRST(env->vmd_vms))
			vm_remove(vm, __func__);

		for (known = TAILQ_FIRST(env->vmd_known); known != NULL;
		    known = TAILQ_FIRST(env->vmd_known)) {
			TAILQ_REMOVE(env->vmd_known, known, entry);
			free(known);
		}

		env->vmd_nvm = 0;
	}

	if ((mask & CONFIG_SWITCHES) != 0 && env->vmd_switches != NULL) {
		for (sw = TAILQ_FIRST(env->vmd_switches); sw != NULL;
		    sw = TAILQ_FIRST(env->vmd_switches))
			switch_remove(sw);

		env->vmd_nswitches = 0;
	}
}
/*
 * Send the global configuration (env->vmd_cfg) to every privsep process
 * except the calling one.  The result of proc_compose() is not checked;
 * always returns 0.
 */
int
config_setconfig(struct vmd *env)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 proc;

	DPRINTF("%s: setting config", __func__);

	for (proc = 0; proc < PROC_MAX; proc++) {
		if (proc != privsep_process)
			proc_compose(ps, proc, IMSG_VMDOP_CONFIG,
			    &env->vmd_cfg, sizeof(env->vmd_cfg));
	}

	return (0);
}
/*
 * Receive the global configuration: decode imsg into env->vmd_cfg using
 * vmop_config_read().  Always returns 0.
 */
int
config_getconfig(struct vmd *env, struct imsg *imsg)
{
	struct privsep	*privsep = &env->vmd_ps;

	log_debug("%s: %s retrieving config", __func__,
	    privsep->ps_title[privsep_process]);

	vmop_config_read(imsg, &env->vmd_cfg);

	return (0);
}
/*
 * Ask other privsep processes to reset their state.
 *
 * IMSG_CTL_RESET carrying the reset mask is sent to every process other
 * than the calling one whose ps_what mask overlaps reset.  The result of
 * proc_compose() is not checked; always returns 0.
 */
int
config_setreset(struct vmd *env, unsigned int reset)
{
	struct privsep	*ps = &env->vmd_ps;
	unsigned int	 proc;

	DPRINTF("%s: resetting state", __func__);

	for (proc = 0; proc < PROC_MAX; proc++) {
		if ((reset & ps->ps_what[proc]) != 0 &&
		    proc != privsep_process)
			proc_compose(ps, proc, IMSG_CTL_RESET, &reset,
			    sizeof(reset));
	}

	return (0);
}
int
config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
{
int diskfds[VM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
struct vmd_if *vif;
struct vmop_create_params *vmc = &vm->vm_params;
unsigned int i, j;
int fd = -1, cdromfd = -1, kernfd = -1;
int *tapfds = NULL;
int n = 0, aflags, oflags, ret = -1;
char ifname[IF_NAMESIZE], *s;
char path[PATH_MAX], base[PATH_MAX];
unsigned int unit;
struct timeval tv, rate, since_last;
struct vmop_addr_req var;
size_t bytes = 0;
if (vm->vm_state & VM_STATE_RUNNING) {
log_warnx("%s: vm is already running", __func__);
return (EALREADY);
}
getmonotime(&tv);
if (vm->vm_start_tv.tv_sec) {
timersub(&tv, &vm->vm_start_tv, &since_last);
rate.tv_sec = VM_START_RATE_SEC;
rate.tv_usec = 0;
if (timercmp(&since_last, &rate, <))
vm->vm_start_limit++;
else {
vm->vm_start_limit = 0;
}
log_debug("%s: vm %u restarted after %lld.%ld seconds,"
" limit %d/%d", __func__, vm->vm_vmid, since_last.tv_sec,
since_last.tv_usec, vm->vm_start_limit,
VM_START_RATE_LIMIT);
if (vm->vm_start_limit >= VM_START_RATE_LIMIT) {
log_warnx("%s: vm %u restarted too quickly", __func__,
vm->vm_vmid);
return (EPERM);
}
}
vm->vm_start_tv = tv;
for (i = 0; i < VM_MAX_DISKS_PER_VM; i++)
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
diskfds[i][j] = -1;
tapfds = reallocarray(NULL, vmc->vmc_nnics, sizeof(*tapfds));
if (tapfds == NULL) {
ret = errno;
log_warn("%s: can't allocate tap fds", __func__);
return (ret);
}
for (i = 0; i < vmc->vmc_nnics; i++)
tapfds[i] = -1;
vm->vm_peerid = peerid;
vm->vm_uid = uid;
if (vm->vm_kernel == -1) {
if (vm->vm_kernel_path != NULL) {
kernfd = open(vm->vm_kernel_path, O_RDONLY | O_CLOEXEC);
if (kernfd == -1) {
ret = errno;
log_warn("%s: can't open kernel or BIOS "
"boot image %s", __func__,
vm->vm_kernel_path);
goto fail;
}
}
if (kernfd == -1) {
if ((kernfd = open(VM_DEFAULT_BIOS,
O_RDONLY | O_CLOEXEC)) == -1) {
log_warn("can't open %s", VM_DEFAULT_BIOS);
ret = VMD_BIOS_MISSING;
goto fail;
}
}
if (vm_checkaccess(kernfd,
vmc->vmc_checkaccess & VMOP_CREATE_KERNEL,
uid, R_OK) == -1) {
log_warnx("vm \"%s\" no read access to kernel "
"%s", vmc->vmc_name, vm->vm_kernel_path);
ret = EPERM;
goto fail;
}
vm->vm_kernel = kernfd;
vmc->vmc_kernel = kernfd;
}
if (strlen(vmc->vmc_cdrom)) {
if ((cdromfd =
open(vmc->vmc_cdrom, O_RDONLY)) == -1) {
log_warn("can't open cdrom %s", vmc->vmc_cdrom);
ret = VMD_CDROM_MISSING;
goto fail;
}
if (vm_checkaccess(cdromfd,
vmc->vmc_checkaccess & VMOP_CREATE_CDROM,
uid, R_OK) == -1) {
log_warnx("vm \"%s\" no read access to cdrom %s",
vmc->vmc_name, vmc->vmc_cdrom);
ret = EPERM;
goto fail;
}
}
for (i = 0 ; i < vmc->vmc_ndisks; i++) {
if (strlcpy(path, vmc->vmc_disks[i], sizeof(path))
>= sizeof(path))
log_warnx("disk path %s too long", vmc->vmc_disks[i]);
memset(vmc->vmc_diskbases, 0, sizeof(vmc->vmc_diskbases));
oflags = O_RDWR | O_EXLOCK | O_NONBLOCK;
aflags = R_OK | W_OK;
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
if ((diskfds[i][j] = open(path, oflags)) == -1) {
log_warn("can't open disk %s",
vmc->vmc_disks[i]);
ret = VMD_DISK_MISSING;
goto fail;
}
if (vm_checkaccess(diskfds[i][j],
vmc->vmc_checkaccess & VMOP_CREATE_DISK,
uid, aflags) == -1) {
log_warnx("vm \"%s\" unable to access "
"disk %s", vmc->vmc_name, path);
errno = EPERM;
goto fail;
}
oflags = O_RDONLY | O_NONBLOCK;
aflags = R_OK;
n = virtio_get_base(diskfds[i][j], base, sizeof(base),
vmc->vmc_disktypes[i], path);
if (n == 0)
break;
if (n == -1) {
log_warnx("vm \"%s\" unable to read "
"base for disk %s", vmc->vmc_name,
vmc->vmc_disks[i]);
goto fail;
}
(void)strlcpy(path, base, sizeof(path));
}
}
for (i = 0 ; i < vmc->vmc_nnics; i++) {
vif = &vm->vm_ifs[i];
s = vmc->vmc_ifnames[i];
if (*s != '\0' && strcmp("tap", s) != 0) {
if (priv_getiftype(s, ifname, &unit) == -1 ||
strcmp(ifname, "tap") != 0) {
log_warnx("%s: invalid tap name %s",
__func__, s);
ret = EINVAL;
goto fail;
}
} else
s = NULL;
if (s != NULL) {
snprintf(path, PATH_MAX, "/dev/%s", s);
tapfds[i] = open(path, O_RDWR | O_NONBLOCK);
} else {
tapfds[i] = opentap(ifname);
s = ifname;
}
if (tapfds[i] == -1) {
ret = errno;
log_warnx("%s: can't open /dev/%s", __func__, s);
goto fail;
}
if ((vif->vif_name = strdup(s)) == NULL) {
log_warn("%s: can't save tap %s", __func__, s);
goto fail;
}
s = vmc->vmc_ifswitch[i];
if (*s != '\0') {
if ((vif->vif_switch = strdup(s)) == NULL) {
log_warn("%s: can't save switch %s",
__func__, s);
goto fail;
}
}
s = vmc->vmc_ifgroup[i];
if (*s != '\0') {
if ((vif->vif_group = strdup(s)) == NULL) {
log_warn("%s: can't save group %s",
__func__, s);
goto fail;
}
}
vif->vif_rdomain = vmc->vmc_ifrdomain[i];
vif->vif_flags =
vmc->vmc_ifflags[i] & (VMIFF_UP|VMIFF_OPTMASK);
}
if (vm->vm_ttyname[0] == '\0') {
if (vm_opentty(vm) == -1) {
log_warn("%s: can't open tty %s", __func__,
vm->vm_ttyname[0] == '\0' ? "" : vm->vm_ttyname);
goto fail;
}
}
if ((fd = dup(vm->vm_tty)) == -1) {
log_warn("%s: can't re-open tty %s", __func__, vm->vm_ttyname);
goto fail;
}
proc_compose_imsg(ps, PROC_VMM, IMSG_VMDOP_START_VM_REQUEST,
vm->vm_vmid, vm->vm_kernel, vmc, sizeof(*vmc));
if (strlen(vmc->vmc_cdrom))
proc_compose_imsg(ps, PROC_VMM, IMSG_VMDOP_START_VM_CDROM,
vm->vm_vmid, cdromfd, NULL, 0);
for (i = 0; i < vmc->vmc_ndisks; i++) {
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
if (diskfds[i][j] == -1)
break;
proc_compose_imsg(ps, PROC_VMM,
IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
diskfds[i][j], &i, sizeof(i));
}
}
for (i = 0; i < vmc->vmc_nnics; i++) {
proc_compose_imsg(ps, PROC_VMM, IMSG_VMDOP_START_VM_IF,
vm->vm_vmid, tapfds[i], &i, sizeof(i));
memset(&var, 0, sizeof(var));
var.var_vmid = vm->vm_vmid;
var.var_nic_idx = i;
proc_compose_imsg(ps, PROC_PRIV, IMSG_VMDOP_PRIV_GET_ADDR,
vm->vm_vmid, dup(tapfds[i]), &var, sizeof(var));
}
proc_compose_imsg(ps, PROC_VMM, IMSG_VMDOP_START_VM_END,
vm->vm_vmid, fd, NULL, 0);
free(tapfds);
if (vmc->vmc_nmemranges > 0) {
for (i = 0; i < vmc->vmc_nmemranges; i++)
bytes += vmc->vmc_memranges[i].vmr_size;
memset(&vmc->vmc_memranges, 0, sizeof(vmc->vmc_memranges));
vmc->vmc_nmemranges = 0;
vmc->vmc_memranges[0].vmr_size = bytes;
}
vm->vm_state |= VM_STATE_RUNNING;
return (0);
fail:
log_warnx("failed to start vm %s", vmc->vmc_name);
if (vm->vm_kernel != -1)
close(kernfd);
if (cdromfd != -1)
close(cdromfd);
for (i = 0; i < vmc->vmc_ndisks; i++)
for (j = 0; j < VM_MAX_BASE_PER_DISK; j++)
if (diskfds[i][j] != -1)
close(diskfds[i][j]);
if (tapfds != NULL) {
for (i = 0; i < vmc->vmc_nnics; i++)
close(tapfds[i]);
free(tapfds);
}
if (vm->vm_from_config) {
vm_stop(vm, 0, __func__);
} else {
vm_remove(vm, __func__);
}
return (ret);
}
int
config_getvm(struct privsep *ps, struct imsg *imsg)
{
struct vmop_create_params vmc;
struct vmd_vm *vm = NULL;
uint32_t peer_id;
int fd;
vmop_create_params_read(imsg, &vmc);
fd = imsg_get_fd(imsg);
peer_id = imsg_get_id(imsg);
vmc.vmc_kernel = fd;
errno = 0;
if (vm_register(ps, &vmc, &vm, peer_id, 0) == -1)
goto fail;
vm->vm_state |= VM_STATE_RUNNING;
vm->vm_peerid = (uint32_t)-1;
vm->vm_kernel = fd;
return (0);
fail:
if (fd != -1)
close(fd);
vm_remove(vm, __func__);
if (errno == 0)
errno = EINVAL;
return (-1);
}
/*
 * Receive one disk image descriptor for a VM.
 *
 * The VM is looked up by the id carried in imsg; the payload is the disk
 * index and the attached descriptor is the next image of that disk.  The
 * descriptor is stored in vm_disks[n] at the position given by
 * vmc_diskbases[n], which is incremented on every call.
 *
 * Returns 0 on success.  Returns -1 with errno ENOENT if the VM is unknown,
 * or with errno EINVAL if the index is out of range, no descriptor was
 * attached, or the disk already has VM_MAX_BASE_PER_DISK images.  A
 * descriptor taken from the message is closed on those EINVAL paths, since
 * nothing else would release it.
 */
int
config_getdisk(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm	*vm;
	unsigned int	 n, idx;
	int		 fd;
	uint32_t	 peer_id;

	peer_id = imsg_get_id(imsg);
	errno = 0;
	if ((vm = vm_getbyvmid(peer_id)) == NULL) {
		errno = ENOENT;
		return (-1);
	}

	n = imsg_uint_read(imsg);
	fd = imsg_get_fd(imsg);

	if (n >= vm->vm_params.vmc_ndisks || fd == -1) {
		log_warnx("invalid disk id");
		goto fail;
	}
	idx = vm->vm_params.vmc_diskbases[n]++;
	if (idx >= VM_MAX_BASE_PER_DISK) {
		log_warnx("too many bases for disk");
		goto fail;
	}
	vm->vm_disks[n][idx] = fd;
	return (0);

 fail:
	/* The descriptor now belongs to us; do not leak it. */
	if (fd != -1)
		close(fd);
	errno = EINVAL;
	return (-1);
}
/*
 * Receive the descriptor for one network interface of a VM.
 *
 * The VM is looked up by the id carried in imsg; the payload is the NIC
 * index and the attached descriptor becomes vm_ifs[n].vif_fd.
 *
 * Returns 0 on success.  Returns -1 with errno ENOENT if the VM is unknown,
 * or with errno EINVAL if the index is out of range, the interface already
 * has a descriptor, or none was attached; a received descriptor is closed
 * in that case.
 */
int
config_getif(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm	*vm;
	unsigned int	 nic;
	int		 tap_fd;
	uint32_t	 peer_id;

	peer_id = imsg_get_id(imsg);
	errno = 0;
	vm = vm_getbyvmid(peer_id);
	if (vm == NULL) {
		errno = ENOENT;
		return (-1);
	}

	nic = imsg_uint_read(imsg);
	tap_fd = imsg_get_fd(imsg);

	/* The range check comes first so vm_ifs[] is never indexed blindly. */
	if (nic < vm->vm_params.vmc_nnics &&
	    vm->vm_ifs[nic].vif_fd == -1 && tap_fd != -1) {
		vm->vm_ifs[nic].vif_fd = tap_fd;
		return (0);
	}

	log_warnx("invalid interface id");
	if (tap_fd != -1)
		close(tap_fd);
	errno = EINVAL;
	return (-1);
}
/*
 * Receive the cdrom image descriptor for a VM.
 *
 * The VM is looked up by the id carried in imsg and the attached descriptor
 * is stored in vm_cdrom.
 *
 * Returns 0 on success.  Returns -1 with errno ENOENT if the VM is unknown,
 * or with errno EINVAL if the message carried no descriptor.
 */
int
config_getcdrom(struct privsep *ps, struct imsg *imsg)
{
	struct vmd_vm	*vm;
	int		 cdrom_fd;
	uint32_t	 peer_id;

	peer_id = imsg_get_id(imsg);
	errno = 0;
	vm = vm_getbyvmid(peer_id);
	if (vm == NULL) {
		errno = ENOENT;
		return (-1);
	}

	cdrom_fd = imsg_get_fd(imsg);
	if (cdrom_fd == -1) {
		log_warnx("invalid cdrom id");
		errno = EINVAL;
		return (-1);
	}

	vm->vm_cdrom = cdrom_fd;
	return (0);
}