#include <sys/param.h>
#include <sys/list.h>
#include <sys/stddef.h>
#include <sys/mman.h>
#include <sys/kdi_regs.h>
#include <sys/sysmacros.h>
#include <sys/controlregs.h>
#include <sys/note.h>
#include <sys/debug.h>
#include <errno.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <machine/vmm.h>
#include <vmmapi.h>
#include <libvmm.h>
typedef struct vmm_memseg vmm_memseg_t;

/* vms_flags: the mapping is backed by a named memseg (device memory). */
#define VMM_MEMSEG_DEVMEM 0x1

/*
 * One guest memory mapping, as discovered via vm_mmap_getnext() and
 * vm_get_memseg(); kept on the owning vmm_t's vmm_memlist.
 */
struct vmm_memseg {
	list_node_t vms_list;	/* linkage for vmm_t.vmm_memlist */
	int vms_segid;		/* memory segment id */
	int vms_prot;		/* mapping protection (from vm_mmap_getnext) */
	int vms_flags;		/* VMM_MEMSEG_DEVMEM or 0 */
	uintptr_t vms_gpa;	/* guest physical address of the mapping */
	off_t vms_segoff;	/* offset of the mapping into the segment */
	size_t vms_seglen;	/* total length of the backing segment */
	size_t vms_maplen;	/* length of this mapping */
	char vms_name[64];	/* segment name; empty for sysmem */
};
/*
 * Handle for an open VM: the vmmapi context, the list of discovered guest
 * memory mappings, the local mapping of guest physical memory (valid only
 * between vmm_map() and vmm_unmap()), and the open vcpu handles.
 */
struct vmm {
	struct vmctx *vmm_ctx;	/* vmmapi VM context from vm_open() */
	list_t vmm_memlist;	/* list of vmm_memseg_t */
	char *vmm_mem;		/* local GPA mapping base, or MAP_FAILED */
	size_t vmm_memsize;	/* total bytes mapped via vmm_map() */
	size_t vmm_ncpu;	/* number of active vcpus */
	struct vcpu **vmm_vcpu;	/* vmm_ncpu open vcpu handles */
};
/*
 * Refresh vmm->vmm_ncpu by counting the consecutive vcpu ids (starting at
 * 0) that are set in the VM's active cpuset.
 */
static void
vmm_update_ncpu(vmm_t *vmm)
{
	cpuset_t cpuset;

	/*
	 * The cpuset fetch must not live inside assert(): under NDEBUG the
	 * whole expression would be compiled away and cpuset left
	 * uninitialized. VERIFY0 is always enabled.
	 */
	VERIFY0(vm_active_cpus(vmm->vmm_ctx, &cpuset));

	for (vmm->vmm_ncpu = 0;
	    CPU_ISSET(vmm->vmm_ncpu, &cpuset) == 1;
	    vmm->vmm_ncpu++)
		;
}
/*
 * Open the named VM and build a vmm_t handle for it: create the (empty)
 * memseg list, wait until at least one vcpu is active, then open a handle
 * for every active vcpu.
 *
 * Returns the new handle, or NULL with errno set on failure.
 */
vmm_t *
vmm_open_vm(const char *name)
{
	vmm_t *vmm = NULL;
	int _errno;
	int i;

	vmm = malloc(sizeof (vmm_t));
	if (vmm == NULL)
		return (NULL);

	bzero(vmm, sizeof (vmm_t));
	vmm->vmm_mem = MAP_FAILED;

	list_create(&vmm->vmm_memlist, sizeof (vmm_memseg_t),
	    offsetof(vmm_memseg_t, vms_list));

	vmm->vmm_ctx = vm_open(name);
	if (vmm->vmm_ctx == NULL) {
		list_destroy(&vmm->vmm_memlist);
		free(vmm);
		return (NULL);
	}

	/*
	 * Poll until at least one vcpu is active.
	 * NOTE(review): presumably this races VM creation, where the VM
	 * exists before its vcpus do -- confirm.
	 */
	vmm_update_ncpu(vmm);
	while (vmm->vmm_ncpu == 0) {
		(void) usleep(10000);
		vmm_update_ncpu(vmm);
	}

	vmm->vmm_vcpu = calloc(vmm->vmm_ncpu, sizeof (struct vcpu *));
	if (vmm->vmm_vcpu == NULL) {
		/*
		 * Clear vmm_ncpu so vmm_close_vm() does not walk the
		 * never-allocated vcpu array.
		 */
		vmm->vmm_ncpu = 0;
		goto fail;
	}

	for (i = 0; i < vmm->vmm_ncpu; i++) {
		vmm->vmm_vcpu[i] = vm_vcpu_open(vmm->vmm_ctx, i);
		if (vmm->vmm_vcpu[i] == NULL) {
			_errno = errno;

			/*
			 * Close the vcpus opened so far. "i-- > 0" (not
			 * ">= 0") stops after index 0; the >= form would
			 * read vmm_vcpu[-1].
			 */
			while (i-- > 0)
				vm_vcpu_close(vmm->vmm_vcpu[i]);
			free(vmm->vmm_vcpu);

			/*
			 * Reset the vcpu state so that vmm_close_vm() does
			 * not walk and free the array a second time
			 * (use-after-free / double free otherwise).
			 */
			vmm->vmm_vcpu = NULL;
			vmm->vmm_ncpu = 0;

			errno = _errno;
			goto fail;
		}
	}

	return (vmm);

fail:
	_errno = errno;
	vmm_close_vm(vmm);
	errno = _errno;
	return (NULL);
}
/*
 * Tear down a vmm_t handle: unmap guest memory, close every open vcpu
 * handle, destroy the memseg list, and close the VM context. Safe to call
 * on a partially constructed handle (as from a failed vmm_open_vm()).
 */
void
vmm_close_vm(vmm_t *vmm)
{
	uint_t i;

	if (vmm == NULL)
		return;

	vmm_unmap(vmm);

	/*
	 * vmm_vcpu is NULL if its allocation failed during vmm_open_vm();
	 * the original code dereferenced it unconditionally and crashed on
	 * that path. Individual slots may also be NULL.
	 */
	if (vmm->vmm_vcpu != NULL) {
		for (i = 0; i < vmm->vmm_ncpu; i++) {
			if (vmm->vmm_vcpu[i] != NULL)
				vm_vcpu_close(vmm->vmm_vcpu[i]);
		}
		free(vmm->vmm_vcpu);
	}

	list_destroy(&vmm->vmm_memlist);

	if (vmm->vmm_ctx != NULL)
		vm_close(vmm->vmm_ctx);

	free(vmm);
}
/*
 * Look up the guest memory mapping at or after guest physical address
 * "gpa" and return a freshly allocated vmm_memseg_t describing it, or
 * NULL when there is no further mapping or on lookup/allocation failure.
 */
static vmm_memseg_t *
vmm_get_memseg(vmm_t *vmm, uintptr_t gpa)
{
	vmm_memseg_t seg;
	vmm_memseg_t *msp;
	int flags;

	bzero(&seg, sizeof (vmm_memseg_t));
	seg.vms_gpa = gpa;

	if (vm_mmap_getnext(vmm->vmm_ctx, &seg.vms_gpa, &seg.vms_segid,
	    &seg.vms_segoff, &seg.vms_maplen, &seg.vms_prot, &flags) != 0)
		return (NULL);

	if (vm_get_memseg(vmm->vmm_ctx, seg.vms_segid, &seg.vms_seglen,
	    seg.vms_name, sizeof (seg.vms_name)) != 0)
		return (NULL);

	/* Named segments are device memory; anonymous ones are sysmem. */
	seg.vms_flags = (seg.vms_name[0] != '\0') ? VMM_MEMSEG_DEVMEM : 0;

	msp = malloc(sizeof (vmm_memseg_t));
	if (msp != NULL)
		*msp = seg;

	return (msp);
}
/*
 * Map the VM's entire guest physical address space into this process,
 * read-only or read/write according to "writable". Returns 0 on success;
 * -1 with errno set on failure (including EINVAL if already mapped).
 */
int
vmm_map(vmm_t *vmm, boolean_t writable)
{
	uintptr_t last_gpa = 0;
	vmm_memseg_t *ms;
	int prot_write = writable ? PROT_WRITE : 0;

	/* Refuse to map twice. */
	if (vmm->vmm_mem != MAP_FAILED) {
		errno = EINVAL;
		return (-1);
	}

	assert(list_is_empty(&vmm->vmm_memlist));

	/*
	 * Enumerate every guest memory mapping, remembering the highest
	 * mapping end so we know how much address space to reserve.
	 */
	for (;;) {
		ms = vmm_get_memseg(vmm, last_gpa);
		if (ms == NULL)
			break;
		last_gpa = ms->vms_gpa + ms->vms_maplen;
		list_insert_tail(&vmm->vmm_memlist, ms);
	}

	/*
	 * Reserve one contiguous, inaccessible VA range covering the whole
	 * guest physical space; each segment is then mapped over it with
	 * MAP_FIXED at its GPA, so vmm_mem + gpa addresses guest memory.
	 */
	vmm->vmm_mem = mmap(NULL, last_gpa, PROT_NONE,
	    MAP_PRIVATE | MAP_ANON | MAP_NORESERVE, -1, 0);
	if (vmm->vmm_mem == MAP_FAILED)
		goto fail;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		off_t mapoff;

		if ((ms->vms_flags & VMM_MEMSEG_DEVMEM) == 0) {
			/* sysmem: device offset equals the GPA. */
			mapoff = ms->vms_gpa;
		} else {
			/* devmem segments have their own device offset. */
			if (vm_get_devmem_offset(vmm->vmm_ctx, ms->vms_segid,
			    &mapoff) != 0) {
				goto fail;
			}
		}

		VERIFY(ms->vms_segoff >= 0);
		mapoff += ms->vms_segoff;

		vmm->vmm_memsize += ms->vms_maplen;

		if (mmap(vmm->vmm_mem + ms->vms_gpa, ms->vms_maplen,
		    PROT_READ | prot_write, MAP_SHARED | MAP_FIXED,
		    vm_get_device_fd(vmm->vmm_ctx), mapoff) == MAP_FAILED)
			goto fail;
	}

	return (0);

fail:
	/* vmm_unmap() releases the list, partial mappings and reservation. */
	vmm_unmap(vmm);

	return (-1);
}
/*
 * Undo vmm_map(): unmap every segment mapping, free the memseg list, and
 * release the reserved address range. Safe to call when nothing is mapped.
 */
void
vmm_unmap(vmm_t *vmm)
{
	vmm_memseg_t *seg;

	while ((seg = list_remove_head(&vmm->vmm_memlist)) != NULL) {
		if (vmm->vmm_mem != MAP_FAILED) {
			(void) munmap(vmm->vmm_mem + seg->vms_gpa,
			    seg->vms_maplen);
		}
		free(seg);
	}

	if (vmm->vmm_mem != MAP_FAILED)
		(void) munmap(vmm->vmm_mem, vmm->vmm_memsize);

	vmm->vmm_mem = MAP_FAILED;
	vmm->vmm_memsize = 0;
}
/*
 * Read "len" bytes at guest physical address "addr" into "buf" through the
 * local mapping established by vmm_map(). Returns the number of bytes
 * copied; errno is set to EFAULT if any part of the range was not backed
 * by a mapped segment, and to 0 on complete success.
 */
ssize_t
vmm_pread(vmm_t *vmm, void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	char *dst = buf;	/* cursor; avoids arithmetic on void * */
	vmm_memseg_t *ms;
	ssize_t res = len;

	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			/*
			 * res = bytes of the request extending past this
			 * segment's end (0 if it fits entirely).
			 */
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
			if (res < 0)
				res = 0;

			/*
			 * Guest physical space is identity-mapped at
			 * vmm_mem, so the GPA doubles as the offset.
			 */
			bcopy(vmm->vmm_mem + addr, dst, len - res);
			count += len - res;
			addr += len - res;
			/*
			 * Advance the destination as well; previously a
			 * read spanning two segments overwrote the start
			 * of the buffer with the second chunk.
			 */
			dst += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}
/*
 * Write "len" bytes from "buf" to guest physical address "addr" through
 * the local mapping established by vmm_map(). Returns the number of bytes
 * copied; errno is set to EFAULT if any part of the range was not backed
 * by a mapped segment, and to 0 on complete success.
 */
ssize_t
vmm_pwrite(vmm_t *vmm, const void *buf, size_t len, uintptr_t addr)
{
	ssize_t count = 0;
	const char *src = buf;	/* cursor; avoids arithmetic on void * */
	vmm_memseg_t *ms;
	ssize_t res = len;

	/*
	 * Stop scanning once the request is satisfied (len == 0), as
	 * vmm_pread() does.
	 */
	for (ms = list_head(&vmm->vmm_memlist);
	    ms != NULL && len != 0;
	    ms = list_next(&vmm->vmm_memlist, ms)) {
		if (addr >= ms->vms_gpa &&
		    addr < ms->vms_gpa + ms->vms_maplen) {
			/* Bytes extending past this segment (0 if none). */
			res = (addr + len) - (ms->vms_gpa + ms->vms_maplen);
			if (res < 0)
				res = 0;

			bcopy(src, vmm->vmm_mem + addr, len - res);
			count += len - res;
			addr += len - res;
			/*
			 * Advance the source as well; previously a write
			 * spanning two segments re-sent the start of the
			 * buffer to the second segment.
			 */
			src += len - res;
			len = res;
		}
	}

	if (res)
		errno = EFAULT;
	else
		errno = 0;

	return (count);
}
/*
 * Accessor: the number of active vcpus last observed for this VM.
 */
size_t
vmm_ncpu(vmm_t *vmm)
{
	size_t ncpu = vmm->vmm_ncpu;

	return (ncpu);
}
/*
 * Accessor: total bytes of guest memory mapped by vmm_map() (0 when
 * unmapped).
 */
size_t
vmm_memsize(vmm_t *vmm)
{
	size_t memsize = vmm->vmm_memsize;

	return (memsize);
}
/*
 * Resume execution of all vcpus; returns the vm_resume_all_cpus() result.
 */
int
vmm_cont(vmm_t *vmm)
{
	int ret;

	ret = vm_resume_all_cpus(vmm->vmm_ctx);
	return (ret);
}
/*
 * Single-step one vcpu: arm the MTRAP-exit capability, resume the vcpu,
 * busy-wait until it re-enters the debug cpuset, then disarm the
 * capability. Returns 0 on success, -1 with errno set on failure.
 */
int
vmm_step(vmm_t *vmm, int vcpuid)
{
	cpuset_t cpuset;
	int ret;

	/*
	 * Make the range check explicit instead of relying on the implicit
	 * signed-to-unsigned promotion to catch negative ids.
	 */
	if (vcpuid < 0 || (size_t)vcpuid >= vmm->vmm_ncpu) {
		errno = EINVAL;
		return (-1);
	}

	ret = vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 1);
	if (ret != 0)
		return (-1);

	/*
	 * Keep the resume call outside assert() so it is not compiled away
	 * under NDEBUG; VERIFY0 is always enabled.
	 */
	VERIFY0(vm_resume_cpu(vmm->vmm_vcpu[vcpuid]));

	/* Wait for the vcpu to trap back into the debug set. */
	do {
		(void) vm_debug_cpus(vmm->vmm_ctx, &cpuset);
	} while (!CPU_ISSET(vcpuid, &cpuset));

	(void) vm_set_capability(vmm->vmm_vcpu[vcpuid], VM_CAP_MTRAP_EXIT, 0);

	return (ret);
}
/*
 * Suspend all vcpus. On success the cached vcpu count is refreshed.
 * Returns the vm_suspend_all_cpus() result.
 */
int
vmm_stop(vmm_t *vmm)
{
	int ret;

	ret = vm_suspend_all_cpus(vmm->vmm_ctx);
	if (ret != 0)
		return (ret);

	vmm_update_ncpu(vmm);
	return (0);
}
/*
 * Map kmdb/KDI general-register indices (0 .. KDIREG_NGREG-1, layout from
 * sys/kdi_regs.h) to bhyve VM_REG_* identifiers. Slots with no VM_REG_*
 * counterpart hold VM_REG_LAST and are rejected by vmm_mapreg().
 * NOTE(review): the VM_REG_LAST slots are assumed to line up with the
 * KDIREG entries that have no guest-register equivalent (saved frame/pc,
 * fs/gs bases, trapno/err) -- confirm against sys/kdi_regs.h.
 */
static int vmm_kdi_regmap[] = {
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_LAST,
	VM_REG_LAST,
	VM_REG_GUEST_RIP,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_RFLAGS,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_SS
};

/* The table must provide an entry for every KDI register index. */
CTASSERT(ARRAY_SIZE(vmm_kdi_regmap) == KDIREG_NGREG);
/*
 * System registers reachable through vmm_mapreg() as
 * VMM_REG_OFFSET + index; the order here must match the VMM_REG_*
 * constants (declared in libvmm.h).
 */
static int vmm_sys_regmap[] = {
	VM_REG_GUEST_CR0,
	VM_REG_GUEST_CR2,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_GUEST_DR0,
	VM_REG_GUEST_DR1,
	VM_REG_GUEST_DR2,
	VM_REG_GUEST_DR3,
	VM_REG_GUEST_DR6,
	VM_REG_GUEST_DR7,
	VM_REG_GUEST_EFER,
	VM_REG_GUEST_PDPTE0,
	VM_REG_GUEST_PDPTE1,
	VM_REG_GUEST_PDPTE2,
	VM_REG_GUEST_PDPTE3,
	VM_REG_GUEST_INTR_SHADOW,
};
/*
 * Segment/descriptor-table registers reachable through vmm_mapdesc() as
 * VMM_DESC_OFFSET + index; the order here must match the VMM_DESC_*
 * constants (declared in libvmm.h).
 */
static int vmm_descmap[] = {
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_SS
};
/*
 * Translate a libvmm register number (a KDI index, or VMM_REG_OFFSET +
 * system-register index) into a VM_REG_* identifier. Returns VM_REG_LAST
 * for anything unmappable, with errno = EINVAL when the number itself is
 * out of range.
 */
static int
vmm_mapreg(int reg)
{
	errno = 0;

	if (reg >= 0) {
		if (reg < KDIREG_NGREG)
			return (vmm_kdi_regmap[reg]);
		if (reg >= VMM_REG_OFFSET &&
		    reg < VMM_REG_OFFSET + ARRAY_SIZE(vmm_sys_regmap))
			return (vmm_sys_regmap[reg - VMM_REG_OFFSET]);
	}

	errno = EINVAL;
	return (VM_REG_LAST);
}
/*
 * Translate a libvmm descriptor number (VMM_DESC_OFFSET + index) into a
 * VM_REG_* identifier. Returns VM_REG_LAST with errno = EINVAL when the
 * number is out of range.
 */
static int
vmm_mapdesc(int desc)
{
	int idx = desc - VMM_DESC_OFFSET;

	errno = 0;
	if (idx >= 0 && idx < (int)ARRAY_SIZE(vmm_descmap))
		return (vmm_descmap[idx]);

	errno = EINVAL;
	return (VM_REG_LAST);
}
/*
 * Fetch a single register, identified by a libvmm register number, from
 * the given vcpu. Returns 0 with *val set, or -1 on failure.
 */
int
vmm_getreg(vmm_t *vmm, int vcpuid, int reg, uint64_t *val)
{
	int vmreg = vmm_mapreg(reg);

	if (vmreg == VM_REG_LAST)
		return (-1);

	return (vm_get_register(vmm->vmm_vcpu[vcpuid], vmreg, val));
}
/*
 * Store a single register, identified by a libvmm register number, on the
 * given vcpu. Returns 0 on success, -1 on failure.
 */
int
vmm_setreg(vmm_t *vmm, int vcpuid, int reg, uint64_t val)
{
	int vmreg = vmm_mapreg(reg);

	if (vmreg == VM_REG_LAST)
		return (-1);

	return (vm_set_register(vmm->vmm_vcpu[vcpuid], vmreg, val));
}
/*
 * Fetch "nregs" registers (libvmm register numbers in "regnums") from the
 * given vcpu into "regvals". Returns 0 on success, -1 on failure (bad
 * register number, allocation failure, or vm_get_register_set() error).
 */
int
vmm_get_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	size_t i;
	int ret = -1;

	/* calloc() checks nregs * sizeof (int) for overflow. */
	vm_regnums = calloc(nregs, sizeof (int));
	if (vm_regnums == NULL)
		return (ret);

	/* Translate every register number up front; reject unknown ones. */
	for (i = 0; i < nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_get_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}
/*
 * Store "nregs" registers (libvmm register numbers in "regnums", values in
 * "regvals") on the given vcpu. Returns 0 on success, -1 on failure (bad
 * register number, allocation failure, or vm_set_register_set() error).
 */
int
vmm_set_regset(vmm_t *vmm, int vcpuid, size_t nregs, const int *regnums,
    uint64_t *regvals)
{
	int *vm_regnums;
	size_t i;
	int ret = -1;

	/* calloc() checks nregs * sizeof (int) for overflow. */
	vm_regnums = calloc(nregs, sizeof (int));
	if (vm_regnums == NULL)
		return (ret);

	/* Translate every register number up front; reject unknown ones. */
	for (i = 0; i < nregs; i++) {
		vm_regnums[i] = vmm_mapreg(regnums[i]);
		if (vm_regnums[i] == VM_REG_LAST)
			goto fail;
	}

	ret = vm_set_register_set(vmm->vmm_vcpu[vcpuid], nregs, vm_regnums,
	    regvals);

fail:
	free(vm_regnums);
	return (ret);
}
/*
 * Fetch a segment/descriptor-table register (base, limit, access rights)
 * for the given vcpu. Returns 0 with *vd filled in, or -1 on failure.
 */
int
vmm_get_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
{
	int vmdesc = vmm_mapdesc(desc);

	if (vmdesc == VM_REG_LAST)
		return (-1);

	return (vm_get_desc(vmm->vmm_vcpu[vcpuid], vmdesc, &vd->vd_base,
	    &vd->vd_lim, &vd->vd_acc));
}
/*
 * Store a segment/descriptor-table register (base, limit, access rights)
 * on the given vcpu. Returns 0 on success, -1 on failure.
 */
int
vmm_set_desc(vmm_t *vmm, int vcpuid, int desc, vmm_desc_t *vd)
{
	int vmdesc = vmm_mapdesc(desc);

	if (vmdesc == VM_REG_LAST)
		return (-1);

	return (vm_set_desc(vmm->vmm_vcpu[vcpuid], vmdesc, vd->vd_base,
	    vd->vd_lim, vd->vd_acc));
}
/* Control registers needed to determine a vcpu's paging/MMU mode. */
typedef struct vmm_mmu {
	uint64_t vm_cr0;
	uint64_t vm_cr3;
	uint64_t vm_cr4;
	uint64_t vm_efer;
} vmm_mmu_t;

/*
 * Register set fetched into a vmm_mmu_t; the order must match the struct
 * layout above (callers fill the struct via a uint64_t array cast).
 */
static const int vmm_mmu_regnum[] = {
	VMM_REG_CR0,
	VMM_REG_CR3,
	VMM_REG_CR4,
	VMM_REG_EFER
};

/* x86 page-table entry bits. */
#define X86_PTE_P 0x001ULL		/* present */
#define X86_PTE_PS 0x080ULL		/* page size (large page) */
#define X86_PTE_PHYSMASK 0x000ffffffffff000ULL	/* physical address bits */
#define X86_PAGE_SHIFT 12
#define X86_PAGE_SIZE (1ULL << X86_PAGE_SHIFT)

/*
 * Segment access-rights bits as returned in vmm_desc_t.vd_acc.
 * NOTE(review): bit positions assumed to follow the VMX access-rights
 * layout -- confirm against vmmapi/vm_get_desc().
 */
#define X86_SEG_CODE_DATA (1ULL << 4)
#define X86_SEG_PRESENT (1ULL << 7)
#define X86_SEG_LONG (1ULL << 13)
#define X86_SEG_BIG (1ULL << 14)
#define X86_SEG_GRANULARITY (1ULL << 15)
#define X86_SEG_UNUSABLE (1ULL << 16)
#define X86_SEG_USABLE (X86_SEG_PRESENT | X86_SEG_CODE_DATA)
#define X86_SEG_USABLE_MASK (X86_SEG_UNUSABLE | X86_SEG_USABLE)
/*
 * Recursively walk the guest's page tables to translate "vaddr" into a
 * guest physical address.
 *
 * "pte" is the table entry for the current "level" (at the top-most call
 * it is the CR3 value); "ia32" selects the 32-bit format (4-byte entries,
 * 10 index bits per level) over the PAE/long-mode format (8-byte entries,
 * 9 index bits). On success *paddr is set and 0 returned; on a
 * non-present entry or a failed table read, -1 with errno set.
 */
static int
vmm_pte2paddr(vmm_t *vmm, uint64_t pte, boolean_t ia32, int level,
    uint64_t vaddr, uint64_t *paddr)
{
	int pte_size = ia32 ? sizeof (uint32_t) : sizeof (uint64_t);
	int off_bits = ia32 ? 10 : 9;
	boolean_t hugepage = B_FALSE;
	uint64_t offset;
	uint64_t off_mask, off_shift;

	/*
	 * Present check; level 4 (long-mode CR3) is exempt.
	 * NOTE(review): for the 2- and 3-level walks (prot/PAE) this also
	 * tests bit 0 of the CR3 value, which architecturally is not a
	 * present bit -- confirm intended.
	 */
	if (level < 4 && (pte & X86_PTE_P) == 0) {
		errno = EFAULT;
		return (-1);
	}

	/* Bits of vaddr still to be translated by this and lower levels. */
	off_shift = X86_PAGE_SHIFT + off_bits * level;
	off_mask = (1ULL << off_shift) - 1;

	offset = vaddr & off_mask;

	if ((level == 1 || level == 2) && (pte & X86_PTE_PS) != 0) {
		/* Large page: the whole remaining offset applies. */
		hugepage = B_TRUE;
	} else {
		if (level > 0) {
			/*
			 * Reduce to this level's index bits, scaled by the
			 * entry size, to get the byte offset into the next
			 * table.
			 */
			offset >>= off_shift - off_bits;
			offset <<= X86_PAGE_SHIFT - off_bits;
		}
		off_mask = 0xfff;
	}

	*paddr = (pte & X86_PTE_PHYSMASK & ~off_mask) + offset;

	/* Level 0 entry or large page: *paddr is the final translation. */
	if (level == 0 || hugepage)
		return (0);

	/* Read the next-level entry from guest memory and recurse. */
	pte = 0;
	if (vmm_pread(vmm, &pte, pte_size, *paddr) != pte_size)
		return (-1);

	return (vmm_pte2paddr(vmm, pte, ia32, level - 1, vaddr, paddr));
}
/*
 * Classify the vcpu's paging mode from its control registers: real mode
 * (no protection), protected mode without PAE, PAE, or long mode.
 */
static vmm_mode_t
vmm_vcpu_mmu_mode(vmm_t *vmm, int vcpuid __unused, vmm_mmu_t *mmu)
{
	if ((mmu->vm_cr0 & CR0_PE) == 0) {
		/* Protection disabled entirely. */
		return (VMM_MODE_REAL);
	}
	if ((mmu->vm_cr4 & CR4_PAE) == 0) {
		/* Protected mode, classic 32-bit paging. */
		return (VMM_MODE_PROT);
	}
	if ((mmu->vm_efer & AMD_EFER_LME) == 0) {
		/* PAE enabled but long mode is not. */
		return (VMM_MODE_PAE);
	}
	return (VMM_MODE_LONG);
}
/*
 * Fetch the MMU-relevant control registers for a vcpu and classify its
 * paging mode; VMM_MODE_UNKNOWN if the registers cannot be read.
 */
vmm_mode_t
vmm_vcpu_mode(vmm_t *vmm, int vcpuid)
{
	vmm_mmu_t mmu = { 0 };
	int error;

	error = vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu);

	return (error == 0 ? vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu) :
	    VMM_MODE_UNKNOWN);
}
/*
 * Determine the instruction-set width of the vcpu from the L and D/B bits
 * of its CS access rights: neither set = 16-bit, L = 64-bit, D/B = 32-bit.
 */
vmm_isa_t
vmm_vcpu_isa(vmm_t *vmm, int vcpuid)
{
	vmm_desc_t cs;
	uint64_t bits;

	if (vmm_get_desc(vmm, vcpuid, VMM_DESC_CS, &cs) != 0)
		return (VMM_ISA_UNKNOWN);

	bits = cs.vd_acc & (X86_SEG_BIG | X86_SEG_LONG);
	if (bits == 0)
		return (VMM_ISA_16);
	if (bits == X86_SEG_LONG)
		return (VMM_ISA_64);
	if (bits == X86_SEG_BIG)
		return (VMM_ISA_32);

	/* L and D/B both set is not a valid combination. */
	return (VMM_ISA_UNKNOWN);
}
/*
 * Translate a segment-relative virtual address into a linear address by
 * applying the segment base, with usability and limit checks appropriate
 * to the vcpu's current mode. Returns 0 with *laddr set, or -1 with
 * errno = EFAULT on any check failure.
 */
int
vmm_vtol(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *laddr)
{
	vmm_desc_t desc;
	uint64_t limit;

	if (vmm_get_desc(vmm, vcpuid, seg, &desc) != 0)
		return (-1);

	switch (vmm_vcpu_mode(vmm, vcpuid)) {
	case VMM_MODE_REAL:
		/* FS and GS are rejected in real mode here. */
		if (seg == VMM_DESC_FS || seg == VMM_DESC_GS)
			goto fault;
		/* FALLTHROUGH - real mode also gets the checks below */
	case VMM_MODE_PROT:
	case VMM_MODE_PAE:
		/* Segment must be present and a code/data segment. */
		if ((desc.vd_acc & X86_SEG_USABLE_MASK) != X86_SEG_USABLE)
			goto fault;

		limit = desc.vd_lim;
		/* With the G bit set the limit is in units of 4K pages. */
		if (desc.vd_acc & X86_SEG_GRANULARITY)
			limit *= 4096;

		if (vaddr > limit)
			goto fault;
		/* FALLTHROUGH - apply the segment base */
	case VMM_MODE_LONG:
		/* No limit checking in long mode; base is still applied. */
		*laddr = desc.vd_base + vaddr;
		return (0);

	default:
	fault:
		errno = EFAULT;
		return (-1);
	}
}
/*
 * Translate a segment-relative virtual address to a guest physical
 * address: first to a linear address via vmm_vtol(), then through the
 * guest's page tables according to its paging mode. Returns 0 with *paddr
 * set, or -1 with errno set on failure.
 */
int
vmm_vtop(vmm_t *vmm, int vcpuid, int seg, uint64_t vaddr, uint64_t *paddr)
{
	vmm_mmu_t mmu = { 0 };

	if (vmm_vtol(vmm, vcpuid, seg, vaddr, &vaddr) != 0)
		return (-1);

	if (vmm_get_regset(vmm, vcpuid, ARRAY_SIZE(vmm_mmu_regnum),
	    vmm_mmu_regnum, (uint64_t *)&mmu) != 0)
		return (-1);

	/* Paging disabled: linear addresses are physical. */
	if ((mmu.vm_cr0 & CR0_PG) == 0) {
		*paddr = vaddr;
		return (0);
	}

	switch (vmm_vcpu_mmu_mode(vmm, vcpuid, &mmu)) {
	case VMM_MODE_PROT:
		/* 32-bit page tables, two levels. */
		return (vmm_pte2paddr(vmm, mmu.vm_cr3, B_TRUE, 2, vaddr,
		    paddr));
	case VMM_MODE_PAE:
		/* 64-bit entries, three levels. */
		return (vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 3, vaddr,
		    paddr));
	case VMM_MODE_LONG:
		/* 64-bit entries, four levels. */
		return (vmm_pte2paddr(vmm, mmu.vm_cr3, B_FALSE, 4, vaddr,
		    paddr));
	default:
		return (-1);
	}
}
/*
 * Read "len" bytes from guest virtual address "addr" (relative to segment
 * "seg") into "buf", translating and copying page by page. Returns the
 * number of bytes read, or 0 with errno set on a translation or read
 * failure.
 */
ssize_t
vmm_vread(vmm_t *vmm, int vcpuid, int seg, void *buf, size_t len,
    uintptr_t addr)
{
	/* char cursor: arithmetic on void * is a non-standard extension. */
	char *dst = buf;
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		/*
		 * Clamp each transfer at the next page boundary: adjacent
		 * virtual pages need not be physically contiguous.
		 */
		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pread(vmm, dst, plen, paddr) != plen)
			return (0);

		len -= plen;
		addr += plen;
		dst += plen;
		res += plen;
	}

	return (res);
}
/*
 * Write "len" bytes from "buf" to guest virtual address "addr" (relative
 * to segment "seg"), translating and copying page by page. Returns the
 * number of bytes written, or 0 with errno set on a translation or write
 * failure.
 */
ssize_t
vmm_vwrite(vmm_t *vmm, int vcpuid, int seg, const void *buf, size_t len,
    uintptr_t addr)
{
	/* char cursor: arithmetic on void * is a non-standard extension. */
	const char *src = buf;
	ssize_t res = 0;
	uint64_t paddr;
	size_t plen;
	uint64_t boundary;

	while (len != 0) {
		if (vmm_vtop(vmm, vcpuid, seg, addr, &paddr) != 0) {
			errno = EFAULT;
			return (0);
		}

		/*
		 * Clamp each transfer at the next page boundary: adjacent
		 * virtual pages need not be physically contiguous.
		 */
		boundary = (addr + X86_PAGE_SIZE) & ~(X86_PAGE_SIZE - 1);
		if (addr + len > boundary)
			plen = boundary - addr;
		else
			plen = len;

		if (vmm_pwrite(vmm, src, plen, paddr) != plen)
			return (0);

		len -= plen;
		addr += plen;
		src += plen;
		res += plen;
	}

	return (res);
}