#include <stdint.h>
#include <dev/pci/virtio_pcireg.h>
#include <dev/pv/vioblkreg.h>
#include <dev/pv/virtioreg.h>
#include <errno.h>
#include <event.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "atomicio.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"
extern struct vmd_vm *current_vm;
struct iovec io_v[VIRTIO_QUEUE_SIZE_MAX];
static const char *disk_type(int);
static uint32_t vioblk_read(struct virtio_dev *, struct viodev_msg *, int *);
static int vioblk_write(struct virtio_dev *, struct viodev_msg *);
static uint32_t vioblk_dev_read(struct virtio_dev *, struct viodev_msg *);
static int vioblk_notifyq(struct virtio_dev *, uint16_t);
static ssize_t vioblk_io(struct vioblk_dev *, struct virtio_vq_info *, int,
off_t, struct vring_desc *, struct vring_desc **);
static void dev_dispatch_vm(int, short, void *);
static void handle_sync_io(int, short, void *);
/*
 * Return a printable name for a vmd disk image format.
 */
static const char *
disk_type(int type)
{
	if (type == VMDF_RAW)
		return "raw";
	if (type == VMDF_QCOW2)
		return "qcow2";
	return "unknown";
}
/*
 * Entry point for the vioblk device emulation process.
 *
 * Receives the preconfigured virtio_dev state and then the vm details
 * over the sync channel (fd), maps guest memory via fd_vmm, opens the
 * backing disk image, wires up the sync/async imsg event handlers and
 * services guest i/o until shutdown.
 *
 * Never returns: _exit(2)s with 0 on clean shutdown, or with an
 * errno-style value on failure after sending a VIODEV_MSG_ERROR
 * message back to the vm process.
 */
__dead void
vioblk_main(int fd, int fd_vmm)
{
	struct virtio_dev dev;
	struct vioblk_dev *vioblk = NULL;
	struct viodev_msg msg;
	struct vmd_vm vm;
	ssize_t sz;
	off_t szp = 0;
	int i, ret, type;

	/*
	 * stdio - needed for disk i/o and imsg channels.
	 * vmm + proc - needed to remap guest memory; dropped below.
	 */
	if (pledge("stdio vmm proc", NULL) == -1)
		fatal("pledge");

	/* Zero the shared iovec scratch array used by vioblk_io(). */
	memset(io_v, 0, nitems(io_v)*sizeof(io_v[0]));

	/* Receive our virtio_dev, mostly preconfigured by the vm process. */
	memset(&dev, 0, sizeof(dev));
	sz = atomicio(read, fd, &dev, sizeof(dev));
	if (sz != sizeof(dev)) {
		ret = errno;
		log_warn("failed to receive vioblk");
		goto fail;
	}
	if (dev.dev_type != VMD_DEVTYPE_DISK) {
		ret = EINVAL;
		log_warn("received invalid device type");
		goto fail;
	}
	dev.sync_fd = fd;
	vioblk = &dev.vioblk;

	log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
	    "async fd = %d, capacity = %lld seg_max = %u, vmm fd = %d",
	    __func__, vioblk->ndisk_fd, dev.sync_fd, dev.async_fd,
	    vioblk->capacity, vioblk->seg_max, fd_vmm);

	/* Receive our vm information from the vm process. */
	memset(&vm, 0, sizeof(vm));
	sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm));
	if (sz != sizeof(vm)) {
		ret = EIO;
		log_warnx("failed to receive vm details");
		goto fail;
	}
	current_vm = &vm;
	setproctitle("%s/vioblk%d", vm.vm_params.vmc_name, vioblk->idx);
	log_procinit("vm/%s/vioblk%d", vm.vm_params.vmc_name, vioblk->idx);

	ret = remap_guest_mem(&vm, fd_vmm);
	if (ret) {
		log_warnx("failed to remap guest memory");
		goto fail;
	}

	/* Guest memory is mapped; the vmm fd and pledges are no longer needed. */
	close_fd(fd_vmm);
	if (pledge("stdio", NULL) == -1)
		fatal("pledge2");

	/* Initialize the backing disk image. */
	type = vm.vm_params.vmc_disktypes[vioblk->idx];
	switch (type) {
	case VMDF_RAW:
		ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd,
		    vioblk->ndisk_fd);
		break;
	case VMDF_QCOW2:
		ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd,
		    vioblk->ndisk_fd);
		break;
	default:
		/* Set ret explicitly: it is 0 here after remap_guest_mem. */
		ret = EINVAL;
		log_warnx("invalid disk image type");
		goto fail;
	}
	if (ret || szp < 0) {
		log_warnx("failed to init disk %s image", disk_type(type));
		goto fail;
	}
	/* Capacity is advertised to the guest in 512-byte sectors. */
	vioblk->capacity = szp / 512;
	log_debug("%s: initialized vioblk%d with %s image (capacity=%lld)",
	    __func__, vioblk->idx, disk_type(type), vioblk->capacity);

	/* Wire up the event handling before signaling readiness. */
	event_init();

	log_debug("%s: wiring in async vm event handler (fd=%d)", __func__,
	    dev.async_fd);
	if (vm_device_pipe(&dev, dev_dispatch_vm, NULL)) {
		ret = EIO;
		log_warnx("vm_device_pipe");
		goto fail;
	}

	log_debug("%s: wiring in sync channel handler (fd=%d)", __func__,
	    dev.sync_fd);
	if (imsgbuf_init(&dev.sync_iev.ibuf, dev.sync_fd) == -1) {
		/* Set ret explicitly: it is 0 here on this path. */
		ret = EIO;
		log_warn("imsgbuf_init");
		goto fail;
	}
	imsgbuf_allow_fdpass(&dev.sync_iev.ibuf);
	dev.sync_iev.handler = handle_sync_io;
	dev.sync_iev.data = &dev;
	dev.sync_iev.events = EV_READ;
	imsg_event_add(&dev.sync_iev);

	/* Tell the vm process we are ready on both channels. */
	log_debug("%s: telling vm %s device is ready", __func__,
	    vm.vm_params.vmc_name);
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_READY;
	imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));

	log_debug("%s: sending heartbeat", __func__);
	ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1,
	    &msg, sizeof(msg));
	if (ret == -1) {
		log_warnx("%s: failed to send async ready message!", __func__);
		goto fail;
	}

	/* Engage the event loop. */
	ret = event_dispatch();
	if (ret == 0) {
		/* Clean shutdown. */
		close_fd(dev.sync_fd);
		close_fd(dev.async_fd);
		for (i = 0; i < vioblk->ndisk_fd; i++)
			close_fd(vioblk->disk_fd[i]);
		_exit(0);
	}

fail:
	/* Try letting the vm know we failed something. */
	memset(&msg, 0, sizeof(msg));
	msg.type = VIODEV_MSG_ERROR;
	msg.data = ret;
	imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
	    sizeof(msg));
	imsgbuf_flush(&dev.sync_iev.ibuf);

	close_fd(dev.sync_fd);
	close_fd(dev.async_fd);
	if (vioblk != NULL) {
		for (i = 0; i < vioblk->ndisk_fd; i++)
			close_fd(vioblk->disk_fd[i]);
	}
	_exit(ret);
}
/*
 * Map a virtio block request type to a human readable name for logging.
 */
const char *
vioblk_cmd_name(uint32_t type)
{
	if (type == VIRTIO_BLK_T_IN)
		return "read";
	if (type == VIRTIO_BLK_T_OUT)
		return "write";
	if (type == VIRTIO_BLK_T_SCSI_CMD)
		return "scsi read";
	if (type == VIRTIO_BLK_T_SCSI_CMD_OUT)
		return "scsi write";
	if (type == VIRTIO_BLK_T_FLUSH)
		return "flush";
	if (type == VIRTIO_BLK_T_FLUSH_OUT)
		return "flush out";
	if (type == VIRTIO_BLK_T_GET_ID)
		return "get id";
	return "unknown";
}
/*
 * Process pending requests on virtqueue vq_idx.
 *
 * Walks the available ring from last_avail, validates and executes each
 * request chain (header descriptor, data descriptors, status
 * descriptor), writes the device status byte back into the guest, and
 * publishes the completion in the used ring.
 *
 * Returns nonzero if the guest should be interrupted.  On a malformed
 * ring the device is marked DEVICE_NEEDS_RESET and a config-change
 * interrupt is requested.
 */
static int
vioblk_notifyq(struct virtio_dev *dev, uint16_t vq_idx)
{
	uint32_t cmd_len;
	uint16_t idx, cmd_desc_idx;
	uint8_t ds;
	off_t offset;
	ssize_t sz;
	int is_write, notify = 0;
	char *vr;
	size_t i;
	struct vring_desc *table, *desc;
	struct vring_avail *avail;
	struct vring_used *used;
	struct virtio_blk_req_hdr *cmd;
	struct virtio_vq_info *vq_info;
	struct vioblk_dev *vioblk = &dev->vioblk;

	/*
	 * Bounds check: dev->vq[] has num_queues entries, so vq_idx must
	 * be strictly less than num_queues.  (The original '>' comparison
	 * allowed an off-by-one read of dev->vq[num_queues].)
	 */
	if (vq_idx >= dev->num_queues)
		return (0);

	vq_info = &dev->vq[vq_idx];
	idx = vq_info->last_avail;
	vr = vq_info->q_hva;
	if (vr == NULL)
		fatalx("%s: null vring", __func__);

	/* Compute offsets in table of descriptors, avail ring, and used ring */
	table = (struct vring_desc *)(vr);
	avail = (struct vring_avail *)(vr + vq_info->vq_availoffset);
	used = (struct vring_used *)(vr + vq_info->vq_usedoffset);

	while (idx != avail->idx) {
		/* The header descriptor must be read-only and chained. */
		cmd_desc_idx = avail->ring[idx & vq_info->mask];
		desc = &table[cmd_desc_idx];
		cmd_len = desc->len;

		if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
			log_warnx("%s: unchained cmd descriptor", __func__);
			goto reset;
		}
		if (DESC_WRITABLE(desc)) {
			log_warnx("%s: invalid cmd descriptor state", __func__);
			goto reset;
		}

		/* Map the request header out of guest memory. */
		cmd = hvaddr_mem(desc->addr, sizeof(*cmd));
		if (cmd == NULL)
			goto reset;

		/* Advance to the first data descriptor. */
		desc = &table[desc->next & vq_info->mask];

		switch (cmd->type) {
		case VIRTIO_BLK_T_IN:
		case VIRTIO_BLK_T_OUT:
			is_write = (cmd->type == VIRTIO_BLK_T_OUT) ? 1 : 0;
			offset = cmd->sector * VIRTIO_BLK_SECTOR_SIZE;
			sz = vioblk_io(vioblk, vq_info, is_write, offset, table,
			    &desc);
			if (sz == -1)
				ds = VIRTIO_BLK_S_IOERR;
			else
				ds = VIRTIO_BLK_S_OK;
			break;
		case VIRTIO_BLK_T_GET_ID:
			ds = VIRTIO_BLK_S_UNSUPP;
			break;
		default:
			log_warnx("%s: unsupported vioblk command %d", __func__,
			    cmd->type);
			ds = VIRTIO_BLK_S_UNSUPP;
			break;
		}

		/*
		 * Advance to the status descriptor, bounding the walk by the
		 * queue size to defend against a looping chain.
		 */
		i = 0;
		while (desc->flags & VRING_DESC_F_NEXT) {
			desc = &table[desc->next & vq_info->mask];
			if (++i >= vq_info->qs) {
				log_warnx("%s: descriptor chain overflow",
				    __func__);
				goto reset;
			}
		}

		/* The status byte lands in a guest-writable descriptor. */
		if (!DESC_WRITABLE(desc)) {
			log_warnx("%s: status descriptor unwritable", __func__);
			goto reset;
		}
		if (write_mem(desc->addr, &ds, sizeof(ds)))
			log_warnx("%s: can't write device status data "
			    "@ 0x%llx",__func__, desc->addr);

		dev->isr |= 1;
		notify = 1;

		/* Publish the completion, then bump the used index. */
		used->ring[used->idx & vq_info->mask].id = cmd_desc_idx;
		used->ring[used->idx & vq_info->mask].len = cmd_len;
		__sync_synchronize();
		used->idx++;
		idx++;
	}

	vq_info->last_avail = idx;
	return (notify);

reset:
	/*
	 * When setting the "needs reset" flag, the driver must be notified
	 * via a config change interrupt.
	 */
	dev->status |= DEVICE_NEEDS_RESET;
	dev->isr |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
	return (1);
}
/*
 * Libevent callback for the async channel to the vm process.  Drains
 * pending imsgs, flushes queued output, and reacts to control messages
 * (pause, unpause, verbosity changes).
 */
static void
dev_dispatch_vm(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->async_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct imsg imsg;
	ssize_t res = 0;
	uint32_t msg_type;
	int verbosity;

	if (event & EV_READ) {
		res = imsgbuf_read(ibuf);
		if (res == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (res == 0) {
			/* Remote end closed the pipe; wind down the loop. */
			log_debug("%s: pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno != EPIPE)
				fatal("%s: imsgbuf_write", __func__);
			log_debug("%s: pipe dead (EV_WRITE)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	while ((res = imsg_get(ibuf, &imsg)) != 0) {
		if (res == -1)
			fatal("%s: imsg_get", __func__);
		msg_type = imsg_get_type(&imsg);
		switch (msg_type) {
		case IMSG_VMDOP_PAUSE_VM:
			log_debug("%s: pausing", __func__);
			break;
		case IMSG_VMDOP_UNPAUSE_VM:
			log_debug("%s: unpausing", __func__);
			break;
		case IMSG_CTL_VERBOSE:
			verbosity = imsg_int_read(&imsg);
			log_setverbose(verbosity);
			break;
		default:
			log_warnx("%s: unhandled imsg type %d", __func__,
			    msg_type);
			break;
		}
		imsg_free(&imsg);
	}
	imsg_event_add(iev);
}
/*
 * Libevent callback for the sync channel to the vm process.  Services
 * forwarded register accesses (VIODEV_MSG_IO_READ / IO_WRITE) and the
 * shutdown request.  Read results are composed back to the vm process,
 * optionally carrying an interrupt-deassert state change.
 */
static void
handle_sync_io(int fd, short event, void *arg)
{
	struct virtio_dev *dev = (struct virtio_dev *)arg;
	struct imsgev *iev = &dev->sync_iev;
	struct imsgbuf *ibuf = &iev->ibuf;
	struct viodev_msg msg;
	struct imsg imsg;
	ssize_t n;
	int deassert = 0;

	if (event & EV_READ) {
		if ((n = imsgbuf_read(ibuf)) == -1)
			fatal("%s: imsgbuf_read", __func__);
		if (n == 0) {
			/* This pipe is dead, so remove the event handler */
			log_debug("%s: vioblk pipe dead (EV_READ)", __func__);
			event_del(&iev->ev);
			event_loopexit(NULL);
			return;
		}
	}

	if (event & EV_WRITE) {
		if (imsgbuf_write(ibuf) == -1) {
			if (errno == EPIPE) {
				/* Peer closed; exit the event loop cleanly. */
				log_debug("%s: pipe dead (EV_WRITE)", __func__);
				event_del(&iev->ev);
				event_loopexit(NULL);
				return;
			}
			fatal("%s: imsgbuf_write", __func__);
		}
	}

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatalx("%s: imsg_get (n=%ld)", __func__, n);
		if (n == 0)
			break;
		viodev_msg_read(&imsg, &msg);
		imsg_free(&imsg);

		switch (msg.type) {
		case VIODEV_MSG_IO_READ:
			/* Read IO: make sure to send a reply */
			msg.data = vioblk_read(dev, &msg, &deassert);
			msg.data_valid = 1;
			if (deassert) {
				/* ISR read clears the interrupt (set by vioblk_read). */
				msg.state = INTR_STATE_DEASSERT;
			}
			imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg,
			    sizeof(msg));
			break;
		case VIODEV_MSG_IO_WRITE:
			/* Write IO: no reply needed; may raise an interrupt */
			if (vioblk_write(dev, &msg))
				virtio_assert_irq(dev, 0);
			break;
		case VIODEV_MSG_SHUTDOWN:
			event_del(&dev->sync_iev.ev);
			event_loopbreak();
			return;
		default:
			fatalx("%s: invalid msg type %d", __func__, msg.type);
		}
	}
	imsg_event_add(iev);
}
/*
 * Dispatch a guest register write to the appropriate virtio BAR region.
 * Returns nonzero if the write requires asserting an interrupt (i.e. a
 * queue notification produced completed requests).
 */
static int
vioblk_write(struct virtio_dev *dev, struct viodev_msg *msg)
{
	uint16_t reg = msg->reg;
	int intr = 0;

	switch (reg & 0xFF00) {
	case VIO1_CFG_BAR_OFFSET:
		/* Common virtio config space. */
		(void)virtio_io_cfg(dev, VEI_DIR_OUT, (reg & 0x00FF),
		    msg->data, msg->io_sz);
		break;
	case VIO1_DEV_BAR_OFFSET:
		/* Device config space is read-only; ignore writes. */
		break;
	case VIO1_NOTIFY_BAR_OFFSET:
		/* The written value selects the notified queue. */
		intr = vioblk_notifyq(dev, (uint16_t)(msg->data));
		break;
	case VIO1_ISR_BAR_OFFSET:
		/* ISR is read-to-clear; writes are ignored. */
		break;
	default:
		log_debug("%s: no handler for reg 0x%04x", __func__, reg);
	}

	return (intr);
}
/*
 * Dispatch a guest register read to the appropriate virtio BAR region
 * and return the value.  Reading the ISR clears it and sets *deassert
 * so the caller can lower the interrupt line.
 */
static uint32_t
vioblk_read(struct virtio_dev *dev, struct viodev_msg *msg, int *deassert)
{
	uint16_t reg = msg->reg;
	uint8_t io_sz = msg->io_sz;
	uint32_t val = 0;

	switch (reg & 0xFF00) {
	case VIO1_CFG_BAR_OFFSET:
		/* Common virtio config space. */
		val = virtio_io_cfg(dev, VEI_DIR_IN, (uint8_t)reg, 0, io_sz);
		break;
	case VIO1_DEV_BAR_OFFSET:
		/* Block-device specific config space. */
		val = vioblk_dev_read(dev, msg);
		break;
	case VIO1_NOTIFY_BAR_OFFSET:
		/* Notify region is write-only; reads return 0. */
		break;
	case VIO1_ISR_BAR_OFFSET:
		/* Read-to-clear; tell the caller to deassert the irq. */
		val = dev->isr;
		dev->isr = 0;
		*deassert = 1;
		break;
	default:
		log_debug("%s: no handler for reg 0x%04x", __func__, reg);
	}

	return (val);
}
/*
 * Service a read of the vioblk device config space (capacity, seg_max,
 * geometry).  The 64-bit capacity register is exposed as two 32-bit
 * halves and each half must be read with a 4-byte access.
 */
static uint32_t
vioblk_dev_read(struct virtio_dev *dev, struct viodev_msg *msg)
{
	struct vioblk_dev *vioblk = (struct vioblk_dev *)&dev->vioblk;
	uint16_t reg = msg->reg;
	uint8_t io_sz = msg->io_sz;
	uint32_t val = 0;

	switch (reg & 0xFF) {
	case VIRTIO_BLK_CONFIG_CAPACITY:
		/* Low 32 bits of the capacity (in 512-byte sectors). */
		if (io_sz != 4) {
			log_warnx("%s: unaligned read from capacity register",
			    __func__);
			break;
		}
		val = (uint32_t)(0xFFFFFFFF & vioblk->capacity);
		break;
	case VIRTIO_BLK_CONFIG_CAPACITY + 4:
		/* High 32 bits of the capacity. */
		if (io_sz != 4) {
			log_warnx("%s: unaligned read from capacity register",
			    __func__);
			break;
		}
		val = (uint32_t)(vioblk->capacity >> 32);
		break;
	case VIRTIO_BLK_CONFIG_SEG_MAX:
		if (io_sz != 4) {
			log_warnx("%s: unaligned read from segment max "
			    "register", __func__);
			break;
		}
		val = vioblk->seg_max;
		break;
	case VIRTIO_BLK_CONFIG_GEOMETRY_C:
	case VIRTIO_BLK_CONFIG_GEOMETRY_H:
	case VIRTIO_BLK_CONFIG_GEOMETRY_S:
		/* Geometry is not emulated; reads return 0. */
		break;
	default:
		log_warnx("%s: invalid register 0x%04x", __func__, reg);
	}

	return (val);
}
/*
 * Perform the disk transfer for a single virtio block request.
 *
 * dev:      vioblk device whose file backend performs the i/o
 * vq_info:  virtqueue the request came from (supplies the index mask)
 * is_write: nonzero for a guest write (data flows guest -> disk)
 * offset:   byte offset into the disk image
 * desc_tbl: base of the descriptor table
 * desc:     in/out; on entry the first data descriptor of the chain,
 *           on exit the descriptor following the last data descriptor
 *           (the status descriptor)
 *
 * Gathers the data descriptors into the global io_v iovec array and
 * issues a single preadv/pwritev through the image backend.  Returns
 * the number of bytes transferred, or -1 on any validation or i/o
 * failure.
 */
static ssize_t
vioblk_io(struct vioblk_dev *dev, struct virtio_vq_info *vq_info, int is_write,
    off_t offset, struct vring_desc *desc_tbl, struct vring_desc **desc)
{
	struct iovec *iov = NULL;
	ssize_t sz = 0;
	size_t io_idx = 0;		/* Index into iovec array */
	size_t xfer_sz = 0;		/* Total accumulated i/o size */

	do {
		iov = &io_v[io_idx];

		/*
		 * Reads require writable descriptors (device -> guest) and
		 * writes require read-only ones (guest -> device); the xor
		 * flags any mismatch.
		 */
		if ((!is_write) ^ DESC_WRITABLE(*desc)) {
			log_warnx("%s: invalid descriptor for %s command",
			    __func__, is_write ? "write" : "read");
			return (-1);
		}

		/* Collect our IO segment. */
		iov->iov_len = (size_t)(*desc)->len;
		iov->iov_base = hvaddr_mem((*desc)->addr, iov->iov_len);
		if (iov->iov_base == NULL)
			return (-1);

		/* Move our counters. */
		xfer_sz += iov->iov_len;
		io_idx++;

		/* Guard against infinite chains */
		if (io_idx >= nitems(io_v)) {
			log_warnx("%s: descriptor table "
			    "invalid", __func__);
			return (-1);
		}

		/* Advance to the next descriptor. */
		*desc = &desc_tbl[(*desc)->next & vq_info->mask];
	} while ((*desc)->flags & VRING_DESC_F_NEXT);

	/*
	 * Validate the requested block io operation alignment.
	 *
	 * NOTE(review): with '&&' this only rejects requests where BOTH the
	 * offset and the total size are misaligned; a request with just one
	 * of the two unaligned passes through.  '||' may be intended --
	 * confirm against the virtio block spec before changing.
	 */
	if (offset % VIRTIO_BLK_SECTOR_SIZE != 0 &&
	    xfer_sz % VIRTIO_BLK_SECTOR_SIZE != 0) {
		log_warnx("%s: unaligned read", __func__);
		return (-1);
	}
	if (xfer_sz > SSIZE_MAX) {
		log_warnx("%s: invalid %s size: %zu", __func__,
		    is_write ? "write" : "read", xfer_sz);
		return (-1);
	}

	/* Issue the vectored i/o through the image format backend. */
	if (is_write)
		sz = dev->file.pwritev(dev->file.p, io_v, io_idx, offset);
	else
		sz = dev->file.preadv(dev->file.p, io_v, io_idx, offset);

	/* A short transfer is treated as a hard error. */
	if (sz != (ssize_t)xfer_sz) {
		log_warnx("%s: %s failure at offset 0x%llx, xfer_sz=%zu, "
		    "sz=%ld", __func__, (is_write ? "write" : "read"), offset,
		    xfer_sz, sz);
		return (-1);
	}

	return (sz);
}