#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/kmem.h>
#include <sys/pcpu.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/hma.h>
#include <sys/archsystm.h>
#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmparam.h>
#include <sys/vmm_instruction_emul.h>
#include <sys/vmm_vm.h>
#include <sys/vmm_gpt.h>
#include <sys/vmm_data.h>
#include "vmm_ioport.h"
#include "vmm_host.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"
#include "io/ppt.h"
#include "io/iommu.h"
/* Opaque forward declaration; the local APIC state is defined in vlapic.c. */
struct vlapic;

/*
 * Flags for vm_thread_ctx_t `vtc_status`.
 * NOTE(review): the consumers (vmm_savectx/vmm_restorectx) are not visible in
 * this chunk — confirm exact semantics against their definitions.
 */
#define VTCS_FPU_RESTORED 1
#define VTCS_FPU_CTX_CRITICAL 2

/* Per-thread context tying a kernel thread to a specific vCPU of a VM. */
typedef struct vm_thread_ctx {
	struct vm *vtc_vm;		/* VM this thread operates on */
	int vtc_vcpuid;			/* vCPU index within vtc_vm */
	uint_t vtc_status;		/* VTCS_* flags */
	enum vcpu_ustate vtc_ustate;	/* microstate tracking */
} vm_thread_ctx_t;

/* Number of emulated variable-range MTRR base/mask register pairs. */
#define VMM_MTRR_VAR_MAX 10
/* Writable bits for the respective guest MTRR MSRs. */
#define VMM_MTRR_DEF_MASK \
	(MTRR_DEF_ENABLE | MTRR_DEF_FIXED_ENABLE | MTRR_DEF_TYPE)
#define VMM_MTRR_PHYSBASE_MASK (MTRR_PHYSBASE_PHYSBASE | MTRR_PHYSBASE_TYPE)
#define VMM_MTRR_PHYSMASK_MASK (MTRR_PHYSMASK_PHYSMASK | MTRR_PHYSMASK_VALID)

/* Shadow copy of the guest's MTRR (memory type range register) state. */
struct vm_mtrr {
	uint64_t def_type;	/* MTRRdefType */
	uint64_t fixed4k[8];
	uint64_t fixed16k[2];
	uint64_t fixed64k;
	struct {
		uint64_t base;
		uint64_t mask;
	} var[VMM_MTRR_VAR_MAX];
};
/*
 * Per-vCPU state.  Fields below `lock` are protected by it unless noted
 * otherwise; see vcpu_lock()/vcpu_unlock().
 */
struct vcpu {
	kmutex_t lock;			/* protects state/run_state et al. */
	enum vcpu_state state;		/* IDLE/FROZEN/RUNNING/SLEEPING */
	enum vcpu_run_state run_state;	/* VRS_* INIT/SIPI/RUN progression */
	kcondvar_t vcpu_cv;		/* signalled to wake a sleeping vCPU */
	kcondvar_t state_cv;		/* signalled on transition to IDLE */
	int hostcpu;			/* host pcpu while RUNNING, else NOCPU */
	int lastloccpu;			/* last host cpu this vCPU ran on */
	bool reqidle;			/* a caller is waiting for VCPU_IDLE */
	bool reqconsist;
	bool reqbarrier;
	struct vlapic *vlapic;		/* emulated local APIC */
	enum x2apic_state x2apic_state;
	uint64_t exit_intinfo;		/* event info at time of VM exit */
	uint64_t exc_pending;		/* pending exception to inject */
	bool nmi_pending;		/* NMI awaiting injection */
	bool extint_pending;		/* ExtINT (PIC) awaiting injection */
	uint8_t sipi_vector;		/* vector from most recent SIPI */
	hma_fpu_t *guestfpu;		/* guest FPU/xsave state */
	uint64_t guest_xcr0;		/* guest %xcr0 */
	void *stats;			/* vmm_stat_* counters */
	struct vm_exit exitinfo;	/* exit details returned to userspace */
	uint64_t nextrip;		/* %rip to resume at after emulation */
	struct vie *vie_ctx;		/* instruction emulation context */
	vm_client_t *vmclient;		/* vmspace client for GPA access */
	uint64_t tsc_offset;		/* per-vCPU TSC adjustment */
	struct vm_mtrr mtrr;		/* shadow MTRR state */
	vcpu_cpuid_config_t cpuid_cfg;	/* cpuid responses */
	enum vcpu_ustate ustate;	/* current microstate */
	hrtime_t ustate_when;		/* time of last ustate change */
	uint64_t ustate_total[VU_MAX];	/* accumulated time per ustate */
	vm_thread_ctx_t vtc;		/* per-thread save/restore context */
	struct ctxop *ctxop;		/* context op hooks for this vCPU */
};

#define vcpu_lock(v) mutex_enter(&((v)->lock))
#define vcpu_unlock(v) mutex_exit(&((v)->lock))
#define vcpu_assert_locked(v) ASSERT(MUTEX_HELD(&((v)->lock)))
/* A backing object for guest memory (general RAM or device memory). */
struct mem_seg {
	size_t len;
	bool sysmem;		/* true: general-purpose guest RAM */
	vm_object_t *object;	/* NULL means the slot is unused */
};
#define VM_MAX_MEMSEGS 5

/* A mapping of (part of) a mem_seg into guest-physical address space. */
struct mem_map {
	vm_paddr_t gpa;		/* guest-physical base */
	size_t len;		/* 0 means the slot is unused */
	uintptr_t segoff;	/* offset into the backing segment */
	int segid;		/* index into mem_segs[] */
	int prot;		/* PROT_* access */
	int flags;		/* VM_MEMMAP_F_* */
};
#define VM_MAX_MEMMAPS 8

/*
 * Tunable cap on registered MMIO hooks.
 * NOTE(review): enforcement of this limit is not visible in this chunk.
 */
static uint_t mmiohook_entry_limit = 64;

/* One registered MMIO handler covering [mhe_addr, mhe_addr + mhe_size). */
typedef struct mmiohook_entry {
	mmio_handler_t mhe_func;	/* callback for read/write access */
	void *mhe_arg;			/* opaque argument for mhe_func */
	uint64_t mhe_addr;		/* guest-physical base of the range */
	uint32_t mhe_size;		/* length of the range in bytes */
	uint32_t mhe_pad;		/* explicit struct padding */
} mmiohook_entry_t;

/* Collection of MMIO hooks attached to a VM. */
struct mmiohook_config {
	mmiohook_entry_t *mhc_entries;
	uint_t mhc_count;
};
/* Top-level per-VM state. */
struct vm {
	void *cookie;			/* backend (SVM/VMX) private data */
	void *iommu;			/* iommu domain, NULL if none */
	struct vhpet *vhpet;		/* emulated HPET */
	struct vioapic *vioapic;	/* emulated I/O APIC */
	struct vatpic *vatpic;		/* emulated 8259 PIC pair */
	struct vatpit *vatpit;		/* emulated 8254 PIT */
	struct vpmtmr *vpmtmr;		/* emulated ACPI PM timer */
	struct vrtc *vrtc;		/* emulated RTC */
	volatile cpuset_t active_cpus;	/* vCPUs activated by the guest */
	volatile cpuset_t debug_cpus;	/* vCPUs held for debugging */
	volatile cpuset_t halted_cpus;	/* vCPUs idling in vm_handle_hlt() */
	int suspend_how;		/* VM_SUSPEND_* reason, 0 = running */
	int suspend_source;		/* vcpuid initiating the suspend */
	hrtime_t suspend_when;		/* time the suspend was requested */
	struct mem_map mem_maps[VM_MAX_MEMMAPS];
	struct mem_seg mem_segs[VM_MAX_MEMSEGS];
	struct vmspace *vmspace;	/* guest-physical address space */
	struct vcpu vcpu[VM_MAXCPU];
	uint16_t sockets;		/* guest topology: sockets */
	uint16_t cores;			/* guest topology: cores/socket */
	uint16_t threads;		/* guest topology: threads/core */
	uint16_t maxcpus;		/* fixed at VM_MAXCPU */
	hrtime_t boot_hrtime;		/* hrtime at (re)boot of the VM */
	uint64_t tsc_offset;		/* base guest TSC offset */
	uint64_t guest_freq;		/* guest TSC frequency */
	uint64_t freq_multiplier;	/* guest/host TSC scaling factor */
	struct ioport_config ioports;	/* in/out port hooks */
	struct mmiohook_config mmiohooks; /* MMIO hooks */
	bool mem_transient;		/* memory not from reservoir */
	bool is_paused;			/* instance-wide pause in effect */
};
/* Nonzero once vmm_mod_load() has completed successfully. */
static int vmm_initialized;
/* Host TSC frequency, captured during vmm_init(). */
static uint64_t vmm_host_freq;

/*
 * Catch-all for backend entry points invoked before a real backend (VMX or
 * SVM) has been selected; any such call is a programming error.
 */
static void
nullop_panic(void)
{
	panic("null vmm operation call");
}

/* Default ops vector: every operation panics until vmm_init() runs. */
static struct vmm_ops vmm_ops_null = {
	.init = (vmm_init_func_t)nullop_panic,
	.resume = (vmm_resume_func_t)nullop_panic,
	.vminit = (vmi_init_func_t)nullop_panic,
	.vmrun = (vmi_run_func_t)nullop_panic,
	.vmcleanup = (vmi_cleanup_func_t)nullop_panic,
	.vmgetreg = (vmi_get_register_t)nullop_panic,
	.vmsetreg = (vmi_set_register_t)nullop_panic,
	.vmgetdesc = (vmi_get_desc_t)nullop_panic,
	.vmsetdesc = (vmi_set_desc_t)nullop_panic,
	.vmgetcap = (vmi_get_cap_t)nullop_panic,
	.vmsetcap = (vmi_set_cap_t)nullop_panic,
	.vlapic_init = (vmi_vlapic_init)nullop_panic,
	.vlapic_cleanup = (vmi_vlapic_cleanup)nullop_panic,
	.vmpause = (vmi_pause_t)nullop_panic,
	.vmsavectx = (vmi_savectx)nullop_panic,
	.vmrestorectx = (vmi_restorectx)nullop_panic,
	.vmgetmsr = (vmi_get_msr_t)nullop_panic,
	.vmsetmsr = (vmi_set_msr_t)nullop_panic,
	.vmfreqratio = (vmi_freqratio_t)nullop_panic,
	.fr_fracsize = 0,
	.fr_intsize = 0,
};
/* Active backend ops; switched to VMX or SVM vector in vmm_init(). */
static struct vmm_ops *ops = &vmm_ops_null;

/* Convenience wrappers for dispatching through the active backend. */
#define VMM_INIT() ((*ops->init)())
#define VMM_RESUME() ((*ops->resume)())
#define VMINIT(vm) ((*ops->vminit)(vm))
#define VMRUN(vmi, vcpu, rip) ((*ops->vmrun)(vmi, vcpu, rip))
#define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi))
#define VMGETREG(vmi, vcpu, num, rv) ((*ops->vmgetreg)(vmi, vcpu, num, rv))
#define VMSETREG(vmi, vcpu, num, val) ((*ops->vmsetreg)(vmi, vcpu, num, val))
#define VMGETDESC(vmi, vcpu, num, dsc) ((*ops->vmgetdesc)(vmi, vcpu, num, dsc))
#define VMSETDESC(vmi, vcpu, num, dsc) ((*ops->vmsetdesc)(vmi, vcpu, num, dsc))
#define VMGETCAP(vmi, vcpu, num, rv) ((*ops->vmgetcap)(vmi, vcpu, num, rv))
#define VMSETCAP(vmi, vcpu, num, val) ((*ops->vmsetcap)(vmi, vcpu, num, val))
#define VLAPIC_INIT(vmi, vcpu) ((*ops->vlapic_init)(vmi, vcpu))
#define VLAPIC_CLEANUP(vmi, vlapic) ((*ops->vlapic_cleanup)(vmi, vlapic))

/* Toggle host FPU trap via CR0.TS so guest FPU use can be detected. */
#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
#define fpu_stop_emulating() clts()

SDT_PROVIDER_DEFINE(vmm);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
NULL);

/* Tunables. */
int halt_detection_enabled = 1;	/* track fully-halted VMs (see vm_handle_hlt) */
int trace_guest_exceptions;	/* intercept guest exceptions for tracing */
int trap_wbinvd = 1;		/* intercept guest WBINVD */
/* Forward declarations for static helpers defined later in this file. */
static void vm_free_memmap(struct vm *vm, int ident);
static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
static bool vm_is_suspended(struct vm *, struct vm_exit *);
static void vm_mmiohook_init(struct vm *, struct mmiohook_config *);
static void vm_mmiohook_cleanup(struct vm *, struct mmiohook_config *);

static void vmm_savectx(void *);
static void vmm_restorectx(void *);

/* Context-switch hooks installed per-vCPU (see vcpu_init/ctxop_allocate). */
static const struct ctxop_template vmm_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = vmm_savectx,
	.ct_restore = vmm_restorectx,
};

/* TSC offset/scaling helpers (definitions outside this view). */
static uint64_t calc_tsc_offset(uint64_t base_host_tsc, uint64_t base_guest_tsc,
    uint64_t mult);
static uint64_t calc_guest_tsc(uint64_t host_tsc, uint64_t mult,
    uint64_t offset);
uint64_t calc_freq_multiplier(uint64_t guest_hz, uint64_t host_hz,
    uint32_t frac_size);
uint64_t scale_tsc(uint64_t tsc, uint64_t multiplier, uint32_t frac_size);
#ifdef KTR
/*
 * Render a vcpu_state as a short human-readable label for KTR tracing.
 */
static const char *
vcpu_state2str(enum vcpu_state state)
{
	if (state == VCPU_IDLE)
		return ("idle");
	if (state == VCPU_FROZEN)
		return ("frozen");
	if (state == VCPU_RUNNING)
		return ("running");
	if (state == VCPU_SLEEPING)
		return ("sleeping");
	return ("unknown");
}
#endif
/*
 * Tear down per-vCPU state.  The vlapic is always destroyed (it is recreated
 * by vcpu_init on reinit); the remaining resources are only released when the
 * VM itself is being destroyed (`destroy` == true).
 */
static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);

		vcpu_cpuid_cleanup(&vcpu->cpuid_cfg);

		hma_fpu_free(vcpu->guestfpu);
		vcpu->guestfpu = NULL;

		vie_free(vcpu->vie_ctx);
		vcpu->vie_ctx = NULL;

		vmc_destroy(vcpu->vmclient);
		vcpu->vmclient = NULL;

		ctxop_free(vcpu->ctxop);
		mutex_destroy(&vcpu->lock);
	}
}
/*
 * Initialize (create == true) or reinitialize (create == false) a vCPU.
 * On creation all backing resources are allocated; on reinit only the
 * transient state is reset.  Either way, the vCPU is left halted with a
 * fresh vlapic and cleared pending-event state.
 */
static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		mutex_init(&vcpu->lock, NULL, MUTEX_ADAPTIVE, NULL);

		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->lastloccpu = NOCPU;
		vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP);
		vcpu->stats = vmm_stat_alloc();
		vcpu->vie_ctx = vie_alloc();
		vcpu_cpuid_init(&vcpu->cpuid_cfg);

		vcpu->ustate = VU_INIT;
		vcpu->ustate_when = gethrtime();

		vcpu->vtc.vtc_vm = vm;
		vcpu->vtc.vtc_vcpuid = vcpu_id;
		vcpu->ctxop = ctxop_allocate(&vmm_ctxop_tpl, &vcpu->vtc);
	} else {
		vie_reset(vcpu->vie_ctx);
		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
		vcpu_ustate_change(vm, vcpu_id, VU_INIT);
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	vcpu->run_state = VRS_HALT;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	(void) vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = false;
	vcpu->reqconsist = false;
	vcpu->reqbarrier = false;
	vcpu->exit_intinfo = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;
	vcpu->exc_pending = 0;
	/* Guests start with only legacy x87 state enabled in %xcr0. */
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	(void) hma_fpu_init(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
	vcpu->tsc_offset = 0;
}
/* Report whether guest exceptions should be intercepted (global tunable). */
int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{
	return (trace_guest_exceptions);
}
/* Report whether guest WBINVD should be intercepted (global tunable). */
int
vcpu_trap_wbinvd(struct vm *vm, int vcpuid)
{
	return (trap_wbinvd);
}
/*
 * Return a pointer to the exit-information buffer for the given vCPU.
 * Panics on an out-of-range cpuid.
 */
struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	if (cpuid < 0 || cpuid >= vm->maxcpus)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	return (&vm->vcpu[cpuid].exitinfo);
}
struct vie *
vm_vie_ctx(struct vm *vm, int cpuid)
{
if (cpuid < 0 || cpuid >= vm->maxcpus)
panic("vm_vie_ctx: invalid cpuid %d", cpuid);
return (vm->vcpu[cpuid].vie_ctx);
}
/*
 * One-time module initialization: capture host state, select the hardware
 * backend (VMX on Intel, SVM on AMD), and initialize it.  On any failure the
 * ops vector is left/restored pointing at vmm_ops_null.
 * Returns 0 on success, ENXIO if no usable backend, or the backend's error.
 */
static int
vmm_init(void)
{
	vmm_host_state_init();
	/* unscalehrtime(NANOSEC) yields the host TSC ticks per second. */
	vmm_host_freq = unscalehrtime(NANOSEC);

	if (vmm_is_intel()) {
		ops = &vmm_ops_intel;
	} else if (vmm_is_svm()) {
		ops = &vmm_ops_amd;
	} else {
		return (ENXIO);
	}

	if (!vmm_vm_init()) {
		return (ENXIO);
	}

	const int err = VMM_INIT();
	if (err != 0) {
		/* Undo partial setup so a retry starts from a clean state. */
		vmm_vm_fini();
		ops = &vmm_ops_null;
		return (err);
	}

	return (0);
}
/*
 * Module load entry point: perform one-time initialization and record
 * success in vmm_initialized.  Returns 0 or an errno from vmm_init().
 */
int
vmm_mod_load()
{
	VERIFY(vmm_initialized == 0);

	int err = vmm_init();
	if (err != 0)
		return (err);

	vmm_initialized = 1;
	return (0);
}
/*
 * Module unload entry point: tear down the VM subsystem.
 * Must be balanced with a successful vmm_mod_load().
 */
void
vmm_mod_unload()
{
	VERIFY(vmm_initialized == 1);

	vmm_vm_fini();

	vmm_initialized = 0;
}
/*
 * Probe whether an IOMMU is usable by attempting to create (and immediately
 * destroy) a throw-away domain.  Returns true if the creation succeeded.
 */
bool
vmm_check_iommu(void)
{
	void *domain;
	/*
	 * Use 1ULL so the shift is well-defined regardless of the width of
	 * 'unsigned long': (1UL << 32) is undefined behavior where that type
	 * is 32 bits wide.  The size itself is arbitrary — it only needs to
	 * be plausible enough for domain creation to succeed.
	 */
	const size_t arb_test_sz = (1ULL << 32);

	domain = iommu_create_domain(arb_test_sz);
	if (domain == NULL) {
		return (false);
	}
	iommu_destroy_domain(domain);
	return (true);
}
/*
 * Initialize (create == true) or reinitialize a VM: backend state, emulated
 * devices, I/O hooks, and all vCPUs, then establish the boot-time TSC base.
 * The RTC is only created once; it survives reinit.
 */
static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm);
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	vm_inout_init(vm, &vm->ioports);
	vm_mmiohook_init(vm, &vm->mmiohooks);

	CPU_ZERO(&vm->active_cpus);
	CPU_ZERO(&vm->debug_cpus);
	/*
	 * NOTE(review): halted_cpus is not re-zeroed here.  vm_handle_hlt()
	 * clears its own bit on exit, so this appears self-cleaning, but
	 * confirm no reinit path can leave a stale bit set.
	 */

	vm->suspend_how = 0;
	vm->suspend_source = 0;
	vm->suspend_when = 0;

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_init(vm, i, create);

	/* Record the boot-time host TSC and its hrtime equivalent. */
	const uint64_t boot_tsc = rdtsc_offset();
	vm->boot_hrtime = (hrtime_t)boot_tsc;
	scalehrtime(&vm->boot_hrtime);

	/* Guest initially runs at host frequency with no TSC scaling. */
	vm->guest_freq = vmm_host_freq;
	vm->freq_multiplier = VM_TSCM_NOSCALE;
	vm->tsc_offset = calc_tsc_offset(boot_tsc, 0, vm->freq_multiplier);
}
/* Default guest CPU topology (tunable). */
uint_t cores_per_package = 1;
uint_t threads_per_core = 1;
/*
 * Create a new VM instance: allocate its vmspace (optionally with dirty-page
 * tracking), the vm structure itself, and a vmspace client per vCPU, then
 * run full initialization.  Returns 0 with *retvm set, or an errno.
 */
int
vm_create(uint64_t flags, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/* Refuse creation if the module failed to initialize. */
	if (!vmm_initialized)
		return (ENXIO);

	vmspace = vmspace_alloc(VM_MAXUSER_ADDRESS);
	if (vmspace == NULL) {
		return (ENOMEM);
	}

	if ((flags & VCF_TRACK_DIRTY) != 0) {
		if (vmspace_set_tracking(vmspace, true) != 0) {
			vmspace_destroy(vmspace);
			return (ENOTSUP);
		}
	}

	vm = kmem_zalloc(sizeof (struct vm), KM_SLEEP);

	vm->vmspace = vmspace;
	/* Without VCF_RESERVOIR_MEM, guest memory is transient allocation. */
	vm->mem_transient = (flags & VCF_RESERVOIR_MEM) == 0;
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		vm->vcpu[i].vmclient = vmspace_client_alloc(vmspace);
	}

	vm->sockets = 1;
	vm->cores = cores_per_package;
	vm->threads = threads_per_core;
	vm->maxcpus = VM_MAXCPU;

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}
/* Report the VM's configured CPU topology into the caller's out-params. */
void
vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
    uint16_t *threads, uint16_t *maxcpus)
{
	*sockets = vm->sockets;
	*cores = vm->cores;
	*threads = vm->threads;
	*maxcpus = vm->maxcpus;
}
/* Return the (fixed) maximum vCPU count for this VM. */
uint16_t
vm_get_maxcpus(struct vm *vm)
{
	return (vm->maxcpus);
}
/*
 * Set the guest-visible CPU topology.  `maxcpus` is immutable: callers must
 * pass 0 (meaning "leave unchanged"); any other value is rejected.  The
 * requested topology may not describe more CPUs than vm->maxcpus.
 * Returns 0 on success or EINVAL.
 */
int
vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
    uint16_t threads, uint16_t maxcpus)
{
	/* Ignore maxcpus other than 0 — it cannot be changed. */
	if (maxcpus != 0)
		return (EINVAL);

	/*
	 * Widen to uint64_t before multiplying: three uint16_t operands are
	 * promoted to int, and their product can exceed INT_MAX, which is
	 * signed-overflow undefined behavior and could let an oversized
	 * topology slip past this check.
	 */
	if ((uint64_t)sockets * cores * threads > vm->maxcpus)
		return (EINVAL);

	vm->sockets = sockets;
	vm->cores = cores;
	vm->threads = threads;
	vm->maxcpus = VM_MAXCPU;	/* XXX: maxcpus stays fixed for now */
	return (0);
}
/*
 * Tear down a VM for destruction (destroy == true) or reinitialization.
 * Ordering matters: passthru devices and the IOMMU domain go first, then
 * emulated devices, then vCPUs and backend state, and finally memory
 * mappings/segments.  On reinit, sysmem mappings are preserved (minus their
 * IOMMU flag) so the guest's RAM layout survives.
 */
static void
vm_cleanup(struct vm *vm, bool destroy)
{
	struct mem_map *mm;
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vpmtmr_cleanup(vm->vpmtmr);

	vm_inout_cleanup(vm, &vm->ioports);
	vm_mmiohook_cleanup(vm, &vm->mmiohooks);

	/* The RTC persists across reinit; only reset its state. */
	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);

	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->maxcpus; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (destroy || !sysmem_mapping(vm, mm)) {
			vm_free_memmap(vm, i);
		} else {
			/* IOMMU domain is gone; clear the stale flag. */
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
		}
	}

	if (destroy) {
		for (i = 0; i < VM_MAX_MEMSEGS; i++)
			vm_free_memseg(vm, i);

		vmspace_destroy(vm->vmspace);
		vm->vmspace = NULL;
	}
}
/* Fully destroy a VM and free its top-level structure. */
void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	kmem_free(vm, sizeof (*vm));
}
/*
 * Reinitialize a VM in place (soft reboot): transient state is reset while
 * guest RAM layout and the RTC survive.  `flags` is currently unused.
 */
int
vm_reinit(struct vm *vm, uint64_t flags)
{
	vm_cleanup(vm, false);
	vm_init(vm, false);

	return (0);
}
/* Report whether an instance-wide pause is in effect. */
bool
vm_is_paused(struct vm *vm)
{
	return (vm->is_paused);
}
/*
 * Pause the entire instance: each active vCPU's vlapic and backend state,
 * then the time-sensitive emulated devices (HPET, PIT, RTC).
 * Returns EALREADY if already paused.
 */
int
vm_pause_instance(struct vm *vm)
{
	if (vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = true;

	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		/* Skip vCPUs the guest has not activated. */
		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_pause(vcpu->vlapic);
		ops->vmpause(vm->cookie, i);
	}
	vhpet_pause(vm->vhpet);
	vatpit_pause(vm->vatpit);
	vrtc_pause(vm->vrtc);

	return (0);
}
/*
 * Resume a paused instance: devices first (reverse of pause order), then
 * each active vCPU's vlapic.  Returns EALREADY if not paused.
 */
int
vm_resume_instance(struct vm *vm)
{
	if (!vm->is_paused) {
		return (EALREADY);
	}
	vm->is_paused = false;

	vrtc_resume(vm->vrtc);
	vatpit_resume(vm->vatpit);
	vhpet_resume(vm->vhpet);
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		/* Skip vCPUs the guest has not activated. */
		if (!CPU_ISSET(i, &vm->active_cpus)) {
			continue;
		}
		vlapic_resume(vcpu->vlapic);
	}

	return (0);
}
/*
 * Map a host-physical MMIO range into the guest at `gpa`.
 * Returns 0 on success or ENOMEM if the backing object cannot be created.
 */
int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	if (vmm_mmio_alloc(vm->vmspace, gpa, len, hpa) == NULL)
		return (ENOMEM);

	return (0);
}
/* Remove a previously-established MMIO mapping from the guest. */
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	return (vmspace_unmap(vm->vmspace, gpa, len));
}
/*
 * Check whether `gpa` falls inside any memory mapping of the VM, or inside
 * a passthru device's MMIO range.  Intended to be called on the vCPU's own
 * running thread (asserted under INVARIANTS).
 */
bool
vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
{
	struct mem_map *mm;
	int i;

#ifdef INVARIANTS
	int hostcpu, state;
	state = vcpu_get_state(vm, vcpuid, &hostcpu);
	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
#endif

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
			return (true);		/* 'gpa' is sysmem or devmem */
	}

	if (ppt_is_mmio(vm, gpa))
		return (true);			/* 'gpa' is pci passthru mmio */

	return (false);
}
/*
 * Allocate a memory segment of `len` bytes in slot `ident`.  A re-request
 * for an identical existing segment returns EEXIST; a conflicting request
 * returns EINVAL.  `len` must be a non-zero page multiple.
 */
int
vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
{
	struct mem_seg *seg;
	vm_object_t *obj;

	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	if (len == 0 || (len & PAGEOFFSET))
		return (EINVAL);

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		if (seg->len == len && seg->sysmem == sysmem)
			return (EEXIST);
		else
			return (EINVAL);
	}

	obj = vm_object_mem_allocate(len, vm->mem_transient);
	if (obj == NULL)
		return (ENOMEM);

	seg->len = len;
	seg->object = obj;
	seg->sysmem = sysmem;
	return (0);
}
/*
 * Fetch the properties of memory segment `ident`.  Each out-parameter is
 * optional: pass NULL to skip it.  Returns EINVAL for a bad ident.
 */
int
vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
    vm_object_t **objptr)
{
	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
		return (EINVAL);

	const struct mem_seg *seg = &vm->mem_segs[ident];
	if (len != NULL)
		*len = seg->len;
	if (sysmem != NULL)
		*sysmem = seg->sysmem;
	if (objptr != NULL)
		*objptr = seg->object;
	return (0);
}
/*
 * Release memory segment `ident` (drop the object reference and clear the
 * slot).  A slot with no object is silently ignored.
 */
void
vm_free_memseg(struct vm *vm, int ident)
{
	struct mem_seg *seg;

	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
	    ("%s: invalid memseg ident %d", __func__, ident));

	seg = &vm->mem_segs[ident];
	if (seg->object != NULL) {
		vm_object_release(seg->object);
		bzero(seg, sizeof (struct mem_seg));
	}
}
/*
 * Map `len` bytes of segment `segid` starting at segment offset `off` into
 * the guest at `gpa`.  Validates protection, flags, alignment, and segment
 * bounds; claims a free mem_map slot; performs the vmspace mapping and,
 * for wired mappings, populates it immediately.  Returns 0, EINVAL, ENOSPC
 * (no free slot), or EFAULT (map/populate failure).
 */
int
vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, uintptr_t off,
    size_t len, int prot, int flags)
{
	if (prot == 0 || (prot & ~(PROT_ALL)) != 0)
		return (EINVAL);

	if (flags & ~VM_MEMMAP_F_WIRED)
		return (EINVAL);

	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
		return (EINVAL);

	const struct mem_seg *seg = &vm->mem_segs[segid];
	if (seg->object == NULL)
		return (EINVAL);

	const uintptr_t end = off + len;
	if (((gpa | off | end) & PAGEOFFSET) != 0)
		return (EINVAL);
	/* 'end < off' catches unsigned wrap-around of off + len. */
	if (end < off || end > seg->len)
		return (EINVAL);

	/* Claim the first unused (zero-length) mapping slot. */
	struct mem_map *map = NULL;
	for (int i = 0; i < VM_MAX_MEMMAPS; i++) {
		struct mem_map *m = &vm->mem_maps[i];
		if (m->len == 0) {
			map = m;
			break;
		}
	}
	if (map == NULL)
		return (ENOSPC);

	int error = vmspace_map(vm->vmspace, seg->object, off, gpa, len, prot);
	if (error != 0)
		return (EFAULT);

	vm_object_reference(seg->object);

	if ((flags & VM_MEMMAP_F_WIRED) != 0) {
		error = vmspace_populate(vm->vmspace, gpa, len);
		if (error != 0) {
			/* Roll back the mapping if population fails. */
			VERIFY0(vmspace_unmap(vm->vmspace, gpa, len));
			return (EFAULT);
		}
	}

	map->gpa = gpa;
	map->len = len;
	map->segoff = off;
	map->segid = segid;
	map->prot = prot;
	map->flags = flags;
	return (0);
}
/*
 * Unmap the mapping that exactly matches (gpa, len), unless it is still
 * referenced by the IOMMU.  Returns 0 on success, EINVAL if no match.
 */
int
vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	for (int i = 0; i < VM_MAX_MEMMAPS; i++) {
		struct mem_map *mm = &vm->mem_maps[i];

		if (mm->gpa != gpa || mm->len != len)
			continue;
		if ((mm->flags & VM_MEMMAP_F_IOMMU) != 0)
			continue;
		vm_free_memmap(vm, i);
		return (0);
	}

	return (EINVAL);
}
/*
 * Iterator over memory mappings: find the populated mapping with the lowest
 * base address >= *gpa, write its properties to the optional out-params and
 * update *gpa.  Returns 0 on a hit, ENOENT when no mapping remains.
 */
int
vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
    uintptr_t *segoff, size_t *len, int *prot, int *flags)
{
	const struct mem_map *best = NULL;

	for (int i = 0; i < VM_MAX_MEMMAPS; i++) {
		const struct mem_map *mm = &vm->mem_maps[i];

		if (mm->len == 0 || mm->gpa < *gpa)
			continue;
		if (best == NULL || mm->gpa < best->gpa)
			best = mm;
	}

	if (best == NULL)
		return (ENOENT);

	*gpa = best->gpa;
	if (segid != NULL)
		*segid = best->segid;
	if (segoff != NULL)
		*segoff = best->segoff;
	if (len != NULL)
		*len = best->len;
	if (prot != NULL)
		*prot = best->prot;
	if (flags != NULL)
		*flags = best->flags;
	return (0);
}
/*
 * Release mapping slot `ident`: unmap it from the vmspace (must succeed)
 * and clear the slot.  An unused (zero-length) slot is a no-op.
 */
static void
vm_free_memmap(struct vm *vm, int ident)
{
	struct mem_map *mm = &vm->mem_maps[ident];

	if (mm->len == 0)
		return;

	VERIFY0(vmspace_unmap(vm->vmspace, mm->gpa, mm->len));
	bzero(mm, sizeof (struct mem_map));
}
/* True if `mm` is a populated mapping backed by a sysmem segment. */
static __inline bool
sysmem_mapping(struct vm *vm, struct mem_map *mm)
{
	return (mm->len != 0 && vm->mem_segs[mm->segid].sysmem);
}
/*
 * Return the highest guest-physical address covered by any sysmem mapping
 * (exclusive upper bound), or 0 if there are none.
 */
vm_paddr_t
vmm_sysmem_maxaddr(struct vm *vm)
{
	vm_paddr_t maxaddr = 0;

	for (int i = 0; i < VM_MAX_MEMMAPS; i++) {
		struct mem_map *mm = &vm->mem_maps[i];

		if (!sysmem_mapping(vm, mm))
			continue;

		const vm_paddr_t end = mm->gpa + mm->len;
		if (end > maxaddr)
			maxaddr = end;
	}
	return (maxaddr);
}
/*
 * Establish (map == true) or remove IOMMU translations for every wired
 * sysmem mapping, page by page, then invalidate the IOMMU TLB.  The
 * VM_MEMMAP_F_IOMMU flag tracks which mappings currently have translations.
 */
static void
vm_iommu_modify(struct vm *vm, bool map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_map *mm;
	vm_client_t *vmc;

	sz = PAGE_SIZE;
	vmc = vmspace_client_alloc(vm->vmspace);

	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
		mm = &vm->mem_maps[i];
		if (!sysmem_mapping(vm, mm))
			continue;

		if (map) {
			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
			    ("iommu map found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
			/* Only wired mappings have stable host pages. */
			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
				continue;
			mm->flags |= VM_MEMMAP_F_IOMMU;
		} else {
			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
				continue;
			mm->flags &= ~VM_MEMMAP_F_IOMMU;
			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
			    ("iommu unmap found invalid memmap %lx/%lx/%x",
			    mm->gpa, mm->len, mm->flags));
		}

		gpa = mm->gpa;
		while (gpa < mm->gpa + mm->len) {
			vm_page_t *vmp;

			/* Resolve the gpa to its backing host page. */
			vmp = vmc_hold(vmc, gpa, PROT_WRITE);
			ASSERT(vmp != NULL);
			hpa = ((uintptr_t)vmp_get_pfn(vmp) << PAGESHIFT);
			(void) vmp_release(vmp);

			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}
	vmc_destroy(vmc);

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	iommu_invalidate_tlb(vm->iommu);
}
int
vm_unassign_pptdev(struct vm *vm, int pptfd)
{
int error;
error = ppt_unassign_device(vm, pptfd);
if (error)
return (error);
if (ppt_assigned_devices(vm) == 0)
vm_iommu_modify(vm, false);
return (0);
}
/*
 * Attach a passthru device to the VM.  The first device triggers creation
 * of the IOMMU domain (sized to cover all sysmem) and establishment of its
 * translations.  Returns 0, ENXIO (no domain), or the assign error.
 */
int
vm_assign_pptdev(struct vm *vm, int pptfd)
{
	int error;
	vm_paddr_t maxaddr;

	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_sysmem_maxaddr(vm);
		vm->iommu = iommu_create_domain(maxaddr);
		if (vm->iommu == NULL)
			return (ENXIO);
		vm_iommu_modify(vm, true);
	}

	error = ppt_assign_device(vm, pptfd);
	return (error);
}
/*
 * Read guest register `reg` of the given vCPU into *retval.  %xcr0 is
 * tracked here in the generic layer; everything else is fetched from the
 * backend.  Returns 0 or EINVAL for bad vcpuid/reg.
 */
int
vm_get_register(struct vm *vm, int vcpuid, int reg, uint64_t *retval)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	/*
	 * Reject negative register IDs too: `reg` arrives as a plain int
	 * from the ioctl path, and the original upper-bound-only check let
	 * negative values reach the backend.
	 */
	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	switch (reg) {
	case VM_REG_GUEST_XCR0:
		*retval = vcpu->guest_xcr0;
		return (0);
	default:
		return (VMGETREG(vm->cookie, vcpuid, reg, retval));
	}
}
/*
 * Write guest register `reg` of the given vCPU.  %rip writes also refresh
 * the cached `nextrip`; %xcr0 writes are validated against the host's
 * capabilities.  Returns 0, EINVAL, or the backend's error.
 */
int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	/*
	 * Reject negative register IDs too: `reg` arrives as a plain int
	 * from the ioctl path, and the original upper-bound-only check let
	 * negative values reach the backend.
	 */
	if (reg < 0 || reg >= VM_REG_LAST)
		return (EINVAL);

	int error;
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	switch (reg) {
	case VM_REG_GUEST_RIP:
		error = VMSETREG(vm->cookie, vcpuid, reg, val);
		if (error == 0) {
			/* Keep the cached resume %rip coherent. */
			vcpu->nextrip = val;
		}
		return (error);
	case VM_REG_GUEST_XCR0:
		if (!validate_guest_xcr0(val, vmm_get_host_xcr0())) {
			return (EINVAL);
		}
		vcpu->guest_xcr0 = val;
		return (0);
	default:
		return (VMSETREG(vm->cookie, vcpuid, reg, val));
	}
}
/* True for the two descriptor-table registers (IDTR/GDTR). */
static bool
is_descriptor_table(int reg)
{
	return (reg == VM_REG_GUEST_IDTR || reg == VM_REG_GUEST_GDTR);
}
/* True for any of the guest segment (selector) registers. */
static bool
is_segment_register(int reg)
{
	return (reg == VM_REG_GUEST_ES || reg == VM_REG_GUEST_CS ||
	    reg == VM_REG_GUEST_SS || reg == VM_REG_GUEST_DS ||
	    reg == VM_REG_GUEST_FS || reg == VM_REG_GUEST_GS ||
	    reg == VM_REG_GUEST_TR || reg == VM_REG_GUEST_LDTR);
}
/*
 * Fetch the descriptor for a segment or descriptor-table register from the
 * backend.  Returns EINVAL for a bad vcpu or non-segment register.
 */
int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_descriptor_table(reg) && !is_segment_register(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}
/*
 * Install the descriptor for a segment or descriptor-table register via the
 * backend.  Returns EINVAL for a bad vcpu or non-segment register.
 */
int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus)
		return (EINVAL);

	if (!is_descriptor_table(reg) && !is_segment_register(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}
/* Convert an HMA xsave get/set result into an errno value. */
static int
translate_hma_xsave_result(hma_fpu_xsave_result_t res)
{
	if (res == HFXR_OK)
		return (0);
	if (res == HFXR_NO_SPACE)
		return (ENOSPC);
	if (res == HFXR_BAD_ALIGN || res == HFXR_UNSUP_FMT ||
	    res == HFXR_UNSUP_FEAT || res == HFXR_INVALID_DATA)
		return (EINVAL);
	panic("unexpected xsave result");
}
/*
 * Copy the vCPU's guest FPU/xsave state into `buf` (at most `len` bytes).
 * Returns 0 or an errno from translate_hma_xsave_result().
 */
int
vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	return (translate_hma_xsave_result(
	    hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len)));
}
/*
 * Load the vCPU's guest FPU/xsave state from `buf` (`len` bytes).
 * Returns 0 or an errno from translate_hma_xsave_result().
 */
int
vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	return (translate_hma_xsave_result(
	    hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len)));
}
/*
 * Read the vCPU's run_state and SIPI vector atomically (under its lock).
 * Returns 0, or EINVAL for a bad vcpuid.
 */
int
vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	*state = vcpu->run_state;
	*sipi_vec = vcpu->sipi_vector;
	vcpu_unlock(vcpu);

	return (0);
}
/*
 * Set the vCPU's run_state and SIPI vector under its lock, then kick the
 * vCPU so it notices the change.  Returns 0, or EINVAL for a bad vcpuid or
 * an invalid state combination.
 */
int
vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	if (!VRS_IS_VALID(state))
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state = state;
	vcpu->sipi_vector = sipi_vec;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);

	return (0);
}
/*
 * Harvest (and reset) the dirty-page state for [gpa, gpa+len) into `bitmap`.
 * Requires dirty tracking to be enabled on the vmspace (EPERM otherwise).
 * Both gpa and len must be page-aligned.
 */
int
vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap)
{
	ASSERT0(gpa & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	if (!vmspace_get_tracking(vm->vmspace)) {
		return (EPERM);
	}

	vmspace_bits_operate(vm->vmspace, gpa, len,
	    VBO_RESET_DIRTY | VBO_FLAG_BITMAP_OUT, bitmap);

	return (0);
}
/*
 * Perform an NPT operation (dirty-bit get/set/reset, or dirty-tracking
 * query/enable/disable) on [gpa, gpa+len).  The VNO_* constants are
 * asserted (at compile time) to match the VBO_* vmspace equivalents so the
 * operation word can be passed straight through.  Returns 0 or an errno.
 */
int
vm_npt_do_operation(struct vm *vm, uint64_t gpa, size_t len, uint32_t oper,
    uint8_t *bitmap, int *rvalp)
{
	ASSERT0(gpa & PAGEOFFSET);
	ASSERT0(len & PAGEOFFSET);

	CTASSERT(VNO_OP_RESET_DIRTY == VBO_RESET_DIRTY);
	CTASSERT(VNO_OP_SET_DIRTY == VBO_SET_DIRTY);
	CTASSERT(VNO_OP_GET_DIRTY == VBO_GET_DIRTY);
	CTASSERT(VNO_FLAG_BITMAP_IN == VBO_FLAG_BITMAP_IN);
	CTASSERT(VNO_FLAG_BITMAP_OUT == VBO_FLAG_BITMAP_OUT);

	/* Strip the bitmap-direction flags to isolate the operation. */
	const uint32_t oper_only =
	    oper & ~(VNO_FLAG_BITMAP_IN | VNO_FLAG_BITMAP_OUT);
	switch (oper_only) {
	case VNO_OP_RESET_DIRTY:
	case VNO_OP_SET_DIRTY:
	case VNO_OP_GET_DIRTY:
		/* A zero-length operation is a trivial success. */
		if (len == 0) {
			break;
		}
		vmspace_bits_operate(vm->vmspace, gpa, len, oper, bitmap);
		break;
	case VNO_OP_GET_TRACK_DIRTY:
		ASSERT3P(rvalp, !=, NULL);
		*rvalp = vmspace_get_tracking(vm->vmspace) ? 1 : 0;
		break;
	case VNO_OP_EN_TRACK_DIRTY:
		return (vmspace_set_tracking(vm->vmspace, true));
	case VNO_OP_DIS_TRACK_DIRTY:
		return (vmspace_set_tracking(vm->vmspace, false));
	default:
		return (EINVAL);
	}

	return (0);
}
/*
 * Load the guest FPU state onto the current CPU.  CR0.TS must be cleared
 * before touching FPU state and is re-set afterwards so that stray host FPU
 * use traps.  Statement order here is significant.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{
	/* Save host FPU and restore guest FPU. */
	fpu_stop_emulating();
	hma_fpu_start_guest(vcpu->guestfpu);

	/* Restore guest XCR0 if XSAVE is enabled in the host. */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}
/*
 * Save the guest FPU state off the current CPU and restore host %xcr0.
 * Expects CR0.TS to be set (FPU emulation enabled) on entry; panics
 * otherwise.  Statement order here is significant.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{
	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* Save guest XCR0 and restore host XCR0. */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* Save guest FPU and restore host FPU. */
	fpu_stop_emulating();
	hma_fpu_stop_guest(vcpu->guestfpu);
}
/*
 * Transition a vCPU's state machine with its lock held.  When `from_idle`
 * is set, the caller insists on an IDLE->FROZEN transition and will wait
 * (kicking the vCPU as needed) until the vCPU reaches IDLE.  Legal
 * transitions are: IDLE/RUNNING/SLEEPING -> FROZEN, and FROZEN -> any.
 * Returns 0 on success or EBUSY for an illegal transition.
 */
static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = true;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			cv_wait(&vcpu->state_cv, &vcpu->lock);
			vcpu->reqidle = false;
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/* hostcpu is only meaningful (== curcpu) while RUNNING. */
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	/* Wake any waiters blocked on reaching IDLE. */
	if (newstate == VCPU_IDLE) {
		cv_broadcast(&vcpu->state_cv);
	}

	return (0);
}
static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
panic("Error %d setting state to %d\n", error, newstate);
}
static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
int error;
if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
panic("Error %d setting state to %d", error, newstate);
}
/*
 * Handle a guest HLT: sleep the vCPU until an interrupt/NMI/run-state
 * change is pending or a bailout (userspace exit) is required.  With
 * halt detection enabled, when every active vCPU is halted with interrupts
 * disabled the whole VM is suspended (it can never make progress).
 * Returns 0 to continue running the guest, -1 to exit to userspace.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
{
	struct vcpu *vcpu;
	int vcpu_halted, vm_halted;
	bool userspace_exit = false;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/* Any pending wake-up event ends the halt. */
		if (vm_nmi_pending(vm, vcpuid))
			break;

		if (vcpu_run_state_pending(vm, vcpuid))
			break;

		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/* Requests to exit to userspace take priority over sleep. */
		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			userspace_exit = true;
			break;
		}

		if (intr_disabled) {
			/*
			 * With interrupts off, only an NMI or INIT can wake
			 * this vCPU; track it as fully halted.
			 */
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			/* All active vCPUs halted: the VM is dead-locked. */
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		}

		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted) {
		(void) vm_suspend(vm, VM_SUSPEND_HALT, -1);
	}

	return (userspace_exit ? -1 : 0);
}
/*
 * Handle a nested-page-fault exit by faulting the page in through the
 * vCPU's vmspace client.  Returns 0 to retry the guest, -1 if the fault
 * could not be satisfied (exit to userspace).
 */
static int
vm_handle_paging(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	vm_client_t *vmc = vcpu->vmclient;
	struct vm_exit *vme = &vcpu->exitinfo;
	const int ftype = vme->u.paging.fault_type;

	ASSERT0(vme->inst_length);
	ASSERT(ftype == PROT_READ || ftype == PROT_WRITE || ftype == PROT_EXEC);

	if (vmc_fault(vmc, vme->u.paging.gpa, ftype) != 0) {
		/*
		 * If the fault cannot be serviced, kick it out to userspace
		 * for handling (or more likely, halting the instance).
		 */
		return (-1);
	}

	return (0);
}
/*
 * Service an MMIO read at `gpa`: dispatch to the local APIC page, the
 * I/O APIC, the HPET, or any registered MMIO hook covering the address.
 * Returns the handler's result, or ESRCH if no handler claims the address.
 */
int
vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
    int rsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);
		err = vlapic_mmio_read(vlapic, gpa, rval, rsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize);
	} else if (vm->mmiohooks.mhc_count > 0) {
		/* First hook whose [addr, addr+size) range contains gpa. */
		for (uint_t i = 0; i < vm->mmiohooks.mhc_count; i++) {
			mmiohook_entry_t *e = &vm->mmiohooks.mhc_entries[i];
			const uint64_t end = e->mhe_addr + e->mhe_size;

			if (gpa >= e->mhe_addr && gpa < end) {
				err = e->mhe_func(e->mhe_arg, false, gpa, rsize,
				    rval);
				break;
			}
		}
	}

	return (err);
}
/*
 * Service an MMIO write at `gpa`; dispatch mirrors vm_service_mmio_read().
 * Returns the handler's result, or ESRCH if no handler claims the address.
 */
int
vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
    int wsize)
{
	int err = ESRCH;

	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		struct vlapic *vlapic = vm_lapic(vm, cpuid);
		err = vlapic_mmio_write(vlapic, gpa, wval, wsize);
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize);
	} else if (vm->mmiohooks.mhc_count > 0) {
		/* First hook whose [addr, addr+size) range contains gpa. */
		for (uint_t i = 0; i < vm->mmiohooks.mhc_count; i++) {
			mmiohook_entry_t *e = &vm->mmiohooks.mhc_entries[i];
			const uint64_t end = e->mhe_addr + e->mhe_size;

			if (gpa >= e->mhe_addr && gpa < end) {
				err = e->mhe_func(e->mhe_arg, true, gpa, wsize,
				    &wval);
				break;
			}
		}
	}

	return (err);
}
/*
 * Handle an MMIO-emulation exit: fetch (if needed) and decode the faulting
 * instruction, verify its linear address against the exit info, then run
 * the MMIO emulation, retrying EAGAIN until the vCPU should yield.
 * Returns 0 to continue in-kernel, <0 to exit to userspace, or an errno.
 */
static int
vm_handle_mmio_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t inst_addr;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	inst_addr = vme->rip + vme->u.mmio_emul.cs_base;
	cs_d = vme->u.mmio_emul.cs_d;

	/* Fetch the faulting instruction */
	if (vie_needs_fetch(vie)) {
		error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr,
		    &fault);
		if (error != 0) {
			return (error);
		} else if (fault) {
			/*
			 * If a fault during instruction fetch was encountered,
			 * it will have asserted that the appropriate exception
			 * be injected at next entry.  No further work is
			 * required.
			 */
			return (0);
		}
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		/* Decode failure: hand the raw exit to userspace. */
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	/* Guard against decoded GLA disagreeing with the hardware's. */
	if (vme->u.mmio_emul.gla != VIE_INVALID_GLA &&
	    vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) {
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

repeat:
	error = vie_emulate_mmio(vie, vm, vcpuid);
	if (error < 0) {
		/*
		 * MMIO not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
	} else if (error == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, which has
		 * not completed its iterations.
		 *
		 * In case this can be emulated in-kernel and has a high
		 * repetition count (causing a tight spin), it should be
		 * deferential to yield conditions.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			/*
			 * Defer to the contending load by making a trip to
			 * userspace with a no-op (BOGUS) exit reason.
			 */
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (error == 0) {
		/* Update %rip now that this instruction is complete. */
		vie_advance_pc(vie, &vcpu->nextrip);
	}

	return (error);
}
/*
 * Handle an in/out (port I/O) exit via the instruction-emulation context,
 * retrying EAGAIN (rep-prefixed instructions) until the vCPU should yield.
 * Returns 0 when complete, <0 to exit to userspace.
 */
static int
vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu;
	struct vie *vie;
	int err;

	vcpu = &vm->vcpu[vcpuid];
	vie = vcpu->vie_ctx;

repeat:
	err = vie_emulate_inout(vie, vm, vcpuid);

	if (err < 0) {
		/*
		 * In/out not handled by any of the in-kernel-emulated devices,
		 * so make a trip out to userspace for it.
		 */
		vie_exitinfo(vie, vme);
		return (err);
	} else if (err == EAGAIN) {
		/*
		 * Continue emulating the rep-prefixed instruction, yielding
		 * to contending load via a BOGUS exit if necessary.
		 */
		if (!vcpu_should_yield(vm, vcpuid)) {
			goto repeat;
		} else {
			vie_reset(vie);
			vme->exitcode = VM_EXITCODE_BOGUS;
			return (-1);
		}
	} else if (err != 0) {
		/* Emulation failure: report as INST_EMUL to userspace. */
		vme->exitcode = VM_EXITCODE_INST_EMUL;
		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
		return (-1);
	}

	/* Update %rip now that this instruction is complete. */
	vie_advance_pc(vie, &vcpu->nextrip);

	return (0);
}
/*
 * Handle a generic instruction-emulation exit (instructions outside the
 * MMIO/in-out paths, e.g. those covered by vie_emulate_other).
 *
 * Returns 0 on success (or after a fault was injected during fetch), -1 when
 * the exit must fall back to userspace, or an errno from the fetch.
 */
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t cs_base;
	int error, fault, cs_d;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	vie = vcpu->vie_ctx;

	vie_cs_info(vie, vm, vcpuid, &cs_base, &cs_d);

	/* Instruction bytes are never pre-populated on this path. */
	ASSERT(vie_needs_fetch(vie));

	error = vie_fetch_instruction(vie, vm, vcpuid, vme->rip + cs_base,
	    &fault);
	if (error != 0) {
		return (error);
	} else if (fault) {
		/* A fault was injected into the guest; re-enter. */
		return (0);
	}

	if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) {
		vie_fallback_exitinfo(vie, vme);
		return (-1);
	}

	error = vie_emulate_other(vie, vm, vcpuid);
	if (error != 0) {
		/* Unhandled: let userspace see the raw exit. */
		vie_fallback_exitinfo(vie, vme);
	} else {
		/* Emulation complete: advance %rip past the instruction. */
		vie_advance_pc(vie, &vcpu->nextrip);
	}
	return (error);
}
/*
 * Service pending run-state transitions (INIT/SIPI) for a vCPU, sleeping
 * while the vCPU is not in a runnable state.
 *
 * The vcpu lock is dropped around the (potentially heavyweight) arch reset
 * and SIPI vectoring calls and reacquired afterwards, so run_state is
 * re-evaluated each time around the loop.
 *
 * Returns 0 once the vCPU is runnable, or -1 if a bailout condition
 * (suspend, reqidle, etc.) interrupted the wait.
 */
static int
vm_handle_run_state(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	bool handled = false;

	vcpu_lock(vcpu);
	while (1) {
		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
			/* Perform the INIT reset with the lock dropped. */
			vcpu_unlock(vcpu);
			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
			vcpu_lock(vcpu);
			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
			vcpu->run_state |= VRS_INIT;
		}

		/* A SIPI is only acted upon from the wait-for-SIPI state. */
		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
		    (VRS_INIT | VRS_PEND_SIPI)) {
			const uint8_t vector = vcpu->sipi_vector;

			vcpu_unlock(vcpu);
			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
			vcpu_lock(vcpu);
			vcpu->run_state &= ~VRS_PEND_SIPI;
			vcpu->run_state |= VRS_RUN;
		}

		if ((vcpu->run_state & VRS_RUN) != 0) {
			handled = true;
			break;
		}

		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
			break;
		}

		/* Sleep until notified of a run-state change or bailout. */
		vcpu_ustate_change(vm, vcpuid, VU_IDLE);
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->lock);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);
	}
	vcpu_unlock(vcpu);

	return (handled ? 0 : -1);
}
/*
 * Read an emulated MTRR register into *val.
 *
 * Returns 0 on success or EINVAL for an MSR number outside the emulated
 * MTRR set.
 */
static int
vm_rdmtrr(const struct vm_mtrr *mtrr, uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* Advertise write-combining, fixed ranges, and var count. */
		*val = MTRR_CAP_WC | MTRR_CAP_FIXED | VMM_MTRR_VAR_MAX;
		break;
	case MSR_MTRRdefType:
		*val = mtrr->def_type;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		*val = mtrr->fixed4k[num - MSR_MTRR4kBase];
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		*val = mtrr->fixed16k[num - MSR_MTRR16kBase];
		break;
	case MSR_MTRR64kBase:
		*val = mtrr->fixed64k;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		/* Variable MTRRs come in (base, mask) register pairs. */
		const uint_t off = num - MSR_MTRRVarBase;
		const uint_t pair = off / 2;

		*val = ((off & 1) == 0) ?
		    mtrr->var[pair].base : mtrr->var[pair].mask;
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}
/*
 * Write an emulated MTRR register, validating reserved bits.
 *
 * Returns 0 on success, EPERM for the read-only capability register, or
 * EINVAL for unknown MSRs or values with reserved bits set.
 */
static int
vm_wrmtrr(struct vm_mtrr *mtrr, uint32_t num, uint64_t val)
{
	switch (num) {
	case MSR_MTRRcap:
		/* MTRRcap is read-only. */
		return (EPERM);
	case MSR_MTRRdefType:
		if (val & ~VMM_MTRR_DEF_MASK) {
			return (EINVAL);
		}
		mtrr->def_type = val;
		break;
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
		mtrr->fixed4k[num - MSR_MTRR4kBase] = val;
		break;
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
		mtrr->fixed16k[num - MSR_MTRR16kBase] = val;
		break;
	case MSR_MTRR64kBase:
		mtrr->fixed64k = val;
		break;
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1: {
		/* Variable MTRRs come in (base, mask) register pairs. */
		const uint_t off = num - MSR_MTRRVarBase;
		const uint_t pair = off / 2;

		if ((off & 1) == 0) {
			if (val & ~VMM_MTRR_PHYSBASE_MASK) {
				return (EINVAL);
			}
			mtrr->var[pair].base = val;
		} else {
			if (val & ~VMM_MTRR_PHYSMASK_MASK) {
				return (EINVAL);
			}
			mtrr->var[pair].mask = val;
		}
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}
/*
 * Does the given MSR number fall within the emulated MTRR register set?
 */
static bool
is_mtrr_msr(uint32_t msr)
{
	bool matched;

	switch (msr) {
	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		matched = true;
		break;
	default:
		matched = false;
		break;
	}
	return (matched);
}
/*
 * Attempt in-kernel emulation of a RDMSR exit.
 *
 * Returns 0 if the MSR was handled here (result placed in guest
 * %eax/%edx), or -1 to defer the access to userspace.
 */
static int
vm_handle_rdmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	uint64_t val = 0;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* No machine-check capabilities are exposed to the guest. */
		val = 0;
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		/* Invalid MTRR accesses raise #GP in the guest. */
		if (vm_rdmtrr(&vcpu->mtrr, code, &val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/* Scale/offset the host TSC into the guest's view. */
		val = calc_guest_tsc(rdtsc_offset(), vm->freq_multiplier,
		    vcpu_tsc_offset(vm, vcpuid, false));
		break;

	default:
		/* Not handled in-kernel; punt to userspace. */
		return (-1);
	}

	/* RDMSR results are split across %eax (low) and %edx (high). */
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
	    val & 0xffffffff));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX,
	    val >> 32));
	return (0);
}
/*
 * Attempt in-kernel emulation of a WRMSR exit.
 *
 * Returns 0 if the MSR was handled here, or -1 to defer to userspace.
 */
static int
vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const uint32_t code = vme->u.msr.code;
	const uint64_t val = vme->u.msr.wval;

	switch (code) {
	case MSR_MCG_CAP:
	case MSR_MCG_STATUS:
		/* Ignore writes to the (empty) machine-check registers. */
		break;

	case MSR_MTRRcap:
	case MSR_MTRRdefType:
	case MSR_MTRR4kBase ... MSR_MTRR4kBase + 7:
	case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
	case MSR_MTRR64kBase:
	case MSR_MTRRVarBase ... MSR_MTRRVarBase + (VMM_MTRR_VAR_MAX * 2) - 1:
		/* Invalid MTRR writes raise #GP in the guest. */
		if (vm_wrmtrr(&vcpu->mtrr, code, val) != 0)
			vm_inject_gp(vm, vcpuid);
		break;

	case MSR_TSC:
		/*
		 * Derive a per-vCPU offset such that subsequent reads of
		 * the guest TSC yield the written value.
		 */
		vcpu->tsc_offset = val - calc_guest_tsc(rdtsc_offset(),
		    vm->freq_multiplier, vm->tsc_offset);
		break;

	default:
		/* Not handled in-kernel; punt to userspace. */
		return (-1);
	}

	return (0);
}
/*
 * Is the VM in the process of (or finished) suspending?
 *
 * If so, and `vme` is non-NULL, populate it with the suspend details so the
 * caller can report the condition out to userspace.
 */
static bool
vm_is_suspended(struct vm *vm, struct vm_exit *vme)
{
	const int how = vm->suspend_how;

	if (how == 0)
		return (false);

	if (vme != NULL) {
		vme->exitcode = VM_EXITCODE_SUSPENDED;
		vme->u.suspended.how = how;
		vme->u.suspended.source = vm->suspend_source;
		/* Clamp to 0 in case suspension predates boot_hrtime. */
		vme->u.suspended.when = (uint64_t)
		    MAX(vm_normalize_hrtime(vm, vm->suspend_when), 0);
	}
	return (true);
}
/*
 * Initiate suspension of a VM instance.
 *
 * Only the first caller wins: the atomic swap of `suspend_when` serializes
 * racing suspend attempts.  After the suspend state is published, every
 * active vCPU is nudged so it notices the condition.
 *
 * Returns 0 on success, EINVAL for a bad `how`, or EALREADY if a suspend
 * was already initiated.
 */
int
vm_suspend(struct vm *vm, enum vm_suspend_how how, int source)
{
	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) {
		return (EINVAL);
	}

	/*
	 * Use of the suspend_when field is racing-swap gated: only the caller
	 * which transitions it from 0 proceeds to set the other fields.
	 */
	const hrtime_t now = gethrtime();
	if (atomic_cmpset_long((ulong_t *)&vm->suspend_when, 0, now) == 0) {
		return (EALREADY);
	}

	/*
	 * Publish source before how: observers treat a non-zero suspend_how
	 * as the indication that suspension is in effect (vm_is_suspended),
	 * so the membar orders the source store ahead of it.
	 */
	vm->suspend_source = source;
	membar_producer();
	vm->suspend_how = how;

	/* Notify all active vcpus that they are now suspended. */
	for (uint_t i = 0; i < vm->maxcpus; i++) {
		struct vcpu *vcpu = &vm->vcpu[i];

		vcpu_lock(vcpu);
		if (!CPU_ISSET(i, &vm->active_cpus)) {
			vcpu_unlock(vcpu);
			continue;
		}

		switch (vcpu->state) {
		case VCPU_IDLE:
		case VCPU_FROZEN:
			/*
			 * vCPUs not currently running in the VM will pick up
			 * the suspension via the bailout checks; account
			 * their time as back in the init state.
			 */
			vcpu_ustate_change(vm, i, VU_INIT);
			break;
		default:
			/* Kick running/sleeping vCPUs out to notice it. */
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
			break;
		}

		vcpu_unlock(vcpu);
	}
	return (0);
}
/*
 * Populate exit info for a run-state (INIT/SIPI) exit at the given %rip.
 */
void
vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vme = vm_exitinfo(vm, vcpuid);

	vme->rip = rip;
	vme->inst_length = 0;
	vme->exitcode = VM_EXITCODE_RUN_STATE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
}
/*
 * Re-home per-vCPU (and, for vCPU 0, VM-wide) emulated device resources to
 * the current host CPU if the vCPU has migrated since its last run.
 */
static void
vm_localize_resources(struct vm *vm, struct vcpu *vcpu)
{
	/* Localization must not race with migration to another CPU. */
	VERIFY(curthread->t_preempt == 0);

	/* Nothing to do if we are still on the same host CPU. */
	if (vcpu->lastloccpu == curcpu)
		return;

	/* VM-wide devices are localized on behalf of the BSP (vCPU 0). */
	if (vcpu == &vm->vcpu[0]) {
		vhpet_localize_resources(vm->vhpet);
		vrtc_localize_resources(vm->vrtc);
		vatpit_localize_resources(vm->vatpit);
	}
	vlapic_localize_resources(vcpu->vlapic);

	vcpu->lastloccpu = curcpu;
}
/*
 * Context-op save hook: invoked when the thread running a vCPU is switched
 * off CPU.  Saves backend state and guest FPU state, and accounts the time
 * the vCPU spends scheduled out.
 */
static void
vmm_savectx(void *arg)
{
	vm_thread_ctx_t *vtc = arg;
	struct vm *vm = vtc->vtc_vm;
	const int vcpuid = vtc->vtc_vcpuid;

	/* Give the backend (SVM/VMX) a chance to save its state first. */
	if (ops->vmsavectx != NULL) {
		ops->vmsavectx(vm->cookie, vcpuid);
	}

	/*
	 * Account for going off-cpu, unless the vcpu is idled, where being
	 * off-cpu is the explicit point.  The prior ustate is stashed so it
	 * can be restored in vmm_restorectx().
	 */
	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
		vtc->vtc_ustate = vm->vcpu[vcpuid].ustate;
		vcpu_ustate_change(vm, vcpuid, VU_SCHED);
	}

	/*
	 * If the guest FPU state is live in the registers, save it out so the
	 * host context can safely be restored.
	 */
	if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) {
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		save_guest_fpustate(vcpu);
		vtc->vtc_status &= ~VTCS_FPU_RESTORED;
	}
}
/*
 * Context-op restore hook: invoked when the thread running a vCPU is
 * switched back on CPU.  Restores the prior ustate, the guest FPU state
 * (when inside the FPU-critical window), and backend state.
 */
static void
vmm_restorectx(void *arg)
{
	vm_thread_ctx_t *vtc = arg;
	struct vm *vm = vtc->vtc_vm;
	const int vcpuid = vtc->vtc_vcpuid;

	/* Restore the ustate that was stashed by vmm_savectx(). */
	if (vm->vcpu[vcpuid].ustate != VU_IDLE) {
		vcpu_ustate_change(vm, vcpuid, vtc->vtc_ustate);
	}

	/*
	 * When coming back on-cpu, only restore the guest FPU status if the
	 * thread is in a context marked as requiring it.  This should be
	 * rare, occurring only when descheduled between the critical_enter()
	 * and critical_exit() window around VMRUN.
	 */
	VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0);
	if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) {
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		restore_guest_fpustate(vcpu);
		vtc->vtc_status |= VTCS_FPU_RESTORED;
	}

	if (ops->vmrestorectx != NULL) {
		ops->vmrestorectx(vm->cookie, vcpuid);
	}
}
#define VEC_MASK_FLAGS (VEC_FLAG_EXIT_CONSISTENT)
#define VEC_MASK_CMD (~VEC_MASK_FLAGS)
/*
 * Perform actions requested by userspace on entry to vm_run(), such as
 * fulfilling the data for an earlier MMIO or in/out exit.
 *
 * The entry command is split into a command portion and flag bits
 * (VEC_MASK_CMD / VEC_MASK_FLAGS).  On successful fulfillment the pending
 * emulation is completed and %rip advanced; EAGAIN from the emulation layer
 * (rep-prefixed instruction with more iterations) is absorbed so the run
 * loop re-enters the guest.
 *
 * Returns 0 on success, EINVAL for an unknown command, or a negative value
 * when exit state was populated for userspace.
 */
static int
vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry,
    struct vm_exit *vme)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	struct vie *vie = vcpu->vie_ctx;
	int err = 0;

	const uint_t cmd = entry->cmd & VEC_MASK_CMD;
	const uint_t flags = entry->cmd & VEC_MASK_FLAGS;

	switch (cmd) {
	case VEC_DEFAULT:
		break;
	case VEC_DISCARD_INSTR:
		/* Drop any cached instruction/emulation state. */
		vie_reset(vie);
		break;
	case VEC_FULFILL_MMIO:
		/* Userspace supplies the result of a pending MMIO op. */
		err = vie_fulfill_mmio(vie, &entry->u.mmio);
		if (err == 0) {
			err = vie_emulate_mmio(vie, vm, vcpuid);
			if (err == 0) {
				vie_advance_pc(vie, &vcpu->nextrip);
			} else if (err < 0) {
				/* Further userspace processing required. */
				vie_exitinfo(vie, vme);
			} else if (err == EAGAIN) {
				/* Let the run loop continue the rep op. */
				vie_reset(vie);
				err = 0;
			}
		}
		break;
	case VEC_FULFILL_INOUT:
		/* Userspace supplies the result of a pending in/out op. */
		err = vie_fulfill_inout(vie, &entry->u.inout);
		if (err == 0) {
			err = vie_emulate_inout(vie, vm, vcpuid);
			if (err == 0) {
				vie_advance_pc(vie, &vcpu->nextrip);
			} else if (err < 0) {
				/* Further userspace processing required. */
				vie_exitinfo(vie, vme);
			} else if (err == EAGAIN) {
				/* Let the run loop continue the rep op. */
				vie_reset(vie);
				err = 0;
			}
		}
		break;
	default:
		return (EINVAL);
	}

	/* Request an exit-at-consistent-state before re-entering the guest. */
	if ((flags & VEC_FLAG_EXIT_CONSISTENT) != 0 && err == 0) {
		vcpu->reqconsist = true;
	}

	return (err);
}
/*
 * Pre-entry check in the vm_run() loop: if instruction emulation is still
 * pending, hand its state to userspace rather than re-entering the guest.
 */
static int
vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme)
{
	struct vie *vie = vm->vcpu[vcpuid].vie_ctx;

	if (!vie_pending(vie))
		return (0);

	vie_exitinfo(vie, vme);
	return (-1);
}
/*
 * Main vCPU run loop: apply userspace entry actions, enter the guest via
 * the backend VMRUN, and service exits in-kernel where possible, looping
 * until an exit must be processed by userspace or an error occurs.
 *
 * On return, the exit details for userspace are in the vCPU's exitinfo.
 */
int
vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
{
	int error;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	bool intr_disabled;
	int affinity_type = CPU_CURRENT;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);
	if (vm->is_paused) {
		return (EBUSY);
	}

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	vcpu_ustate_change(vm, vcpuid, VU_EMU_KERN);

	/*
	 * Attach the save/restore context ops so FPU and backend state are
	 * handled if this thread is descheduled while running the vCPU.
	 */
	vcpu->vtc.vtc_status = 0;
	ctxop_attach(curthread, vcpu->ctxop);

	error = vm_entry_actions(vm, vcpuid, entry, vme);
	if (error != 0) {
		goto exit;
	}

restart:
	error = vm_loop_checks(vm, vcpuid, vme);
	if (error != 0) {
		goto exit;
	}

	thread_affinity_set(curthread, affinity_type);
	/*
	 * Resource localization should happen after the CPU affinity for
	 * the thread has been set, and before any CPU-local resources are
	 * consumed.
	 */
	vm_localize_resources(vm, vcpu);
	/* Reset affinity in case a prior exit requested CPU_BEST. */
	affinity_type = CPU_CURRENT;
	critical_enter();

	/* Force a trip through segment-register update on return to user. */
	PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb);

	/* Load the guest FPU state if it is not already live. */
	if ((vcpu->vtc.vtc_status & VTCS_FPU_RESTORED) == 0) {
		restore_guest_fpustate(vcpu);
		vcpu->vtc.vtc_status |= VTCS_FPU_RESTORED;
	}
	vcpu->vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL;

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	/* Outside VMRUN, FPU state no longer needs ctxop-critical care. */
	vcpu->vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL;

	thread_affinity_clear(curthread);
	critical_exit();

	if (error != 0) {
		/* Communicate the underlying error to userspace. */
		goto exit;
	}

	/* Default next %rip: immediately after the exiting instruction. */
	vcpu->nextrip = vme->rip + vme->inst_length;
	switch (vme->exitcode) {
	case VM_EXITCODE_RUN_STATE:
		error = vm_handle_run_state(vm, vcpuid);
		break;
	case VM_EXITCODE_IOAPIC_EOI:
		vioapic_process_eoi(vm, vcpuid,
		    vme->u.ioapic_eoi.vector);
		break;
	case VM_EXITCODE_HLT:
		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
		break;
	case VM_EXITCODE_PAGING:
		error = vm_handle_paging(vm, vcpuid);
		break;
	case VM_EXITCODE_MMIO_EMUL:
		error = vm_handle_mmio_emul(vm, vcpuid);
		break;
	case VM_EXITCODE_INOUT:
		error = vm_handle_inout(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_INST_EMUL:
		error = vm_handle_inst_emul(vm, vcpuid);
		break;
	case VM_EXITCODE_MONITOR:
	case VM_EXITCODE_MWAIT:
	case VM_EXITCODE_VMINSN:
		/* These instructions are not exposed to the guest. */
		vm_inject_ud(vm, vcpuid);
		break;
	case VM_EXITCODE_RDMSR:
		error = vm_handle_rdmsr(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_WRMSR:
		error = vm_handle_wrmsr(vm, vcpuid, vme);
		break;
	case VM_EXITCODE_HT:
		/* Re-run, but on the best CPU rather than the current one. */
		affinity_type = CPU_BEST;
		break;
	case VM_EXITCODE_MTRAP:
		/* Suspend this vCPU and let userspace handle the trap. */
		VERIFY0(vm_suspend_cpu(vm, vcpuid));
		error = -1;
		break;
	default:
		/* Unhandled exitcode: hand it to userspace for processing. */
		error = -1;
		break;
	}

	if (error == 0) {
		/* VM exit was handled in-kernel; re-enter the guest. */
		goto restart;
	}

exit:
	/*
	 * Detach the context ops and perform a final save by hand; the
	 * kpreempt_disable window keeps the two steps atomic with respect
	 * to being descheduled.
	 */
	kpreempt_disable();
	ctxop_detach(curthread, vcpu->ctxop);
	/* Make sure all of the needed vCPU context state is saved */
	vmm_savectx(&vcpu->vtc);
	kpreempt_enable();

	vcpu_ustate_change(vm, vcpuid,
	    vm_is_suspended(vm, NULL) ? VU_INIT : VU_EMU_USER);
	return (error);
}
/*
 * Arrange for the current instruction to be re-executed on the next guest
 * entry.  For a RUNNING vCPU this means zeroing the reported instruction
 * length; for a FROZEN vCPU, nextrip is rewound to the current %rip.
 */
int
vm_restart_instruction(void *arg, int vcpuid)
{
	struct vm *vm = arg;
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	state = vcpu_get_state(vm, vcpuid, NULL);
	switch (state) {
	case VCPU_RUNNING:
		/* nextrip will be computed from rip + inst_length (now 0). */
		vcpu->exitinfo.inst_length = 0;
		break;
	case VCPU_FROZEN: {
		uint64_t rip;
		int error;

		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
		vcpu->nextrip = rip;
		break;
	}
	default:
		panic("%s: invalid state %d", __func__, state);
	}
	return (0);
}
/*
 * Record event-injection info which was pending at the time of a VM exit,
 * after validating its type/vector/reserved-bit constraints.
 */
int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	vcpu = &vm->vcpu[vcpuid];

	if (!VM_INTINFO_PENDING(info)) {
		/* Nothing pending: normalize to zero. */
		info = 0;
	} else {
		const uint32_t type = VM_INTINFO_TYPE(info);
		const uint8_t vector = VM_INTINFO_VECTOR(info);

		/* An NMI must use the NMI vector. */
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		/* Hardware exceptions occupy vectors 0-31 only. */
		if (type == VM_INTINFO_HWEXCP && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_MASK_RSVD)
			return (EINVAL);
	}

	vcpu->exit_intinfo = info;
	return (0);
}
/*
 * Classification of pending events for exception-coalescing decisions in
 * vm_entry_intinfo().
 */
enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel-specific) */

/*
 * Classify a pending event: interrupts and NMIs are benign; page faults and
 * the contributory exceptions participate in double-fault promotion.
 */
static enum exc_class
exception_class(uint64_t info)
{
	ASSERT(VM_INTINFO_PENDING(info));

	/* Hardware/software interrupts and NMI never contribute to #DF. */
	const uint32_t type = VM_INTINFO_TYPE(info);
	if (type == VM_INTINFO_HWINTR || type == VM_INTINFO_SWINTR ||
	    type == VM_INTINFO_NMI) {
		return (EXC_BENIGN);
	}

	switch (VM_INTINFO_VECTOR(info)) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}
/*
 * Combine the event pending from the last exit (info1) with any newly
 * raised exception (info2) into the event to inject on guest entry,
 * applying the x86 exception-coalescing rules: contributory-on-contributory
 * or non-benign-on-pagefault promotes to #DF, and #DF-on-anything is a
 * triple fault that suspends the VM.
 *
 * Returns true (with *retinfo set) if there is an event to inject.
 */
bool
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	/* Both sources are consumed; unneeded state is re-stashed below. */
	const uint64_t info1 = vcpu->exit_intinfo;
	vcpu->exit_intinfo = 0;
	const uint64_t info2 = vcpu->exc_pending;
	vcpu->exc_pending = 0;

	if (VM_INTINFO_PENDING(info1) && VM_INTINFO_PENDING(info2)) {
		/*
		 * If an exception occurs while attempting to call the
		 * double-fault handler the processor enters shutdown mode
		 * (aka triple fault).
		 */
		if (VM_INTINFO_TYPE(info1) == VM_INTINFO_HWEXCP &&
		    VM_INTINFO_VECTOR(info1) == IDT_DF) {
			(void) vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT, vcpuid);
			*retinfo = 0;
			return (false);
		}

		const enum exc_class exc1 = exception_class(info1);
		const enum exc_class exc2 = exception_class(info2);
		if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
		    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
			/* Promote the coalesced pair to a #DF. */
			*retinfo =
			    VM_INTINFO_VALID |
			    VM_INTINFO_DEL_ERRCODE |
			    VM_INTINFO_HWEXCP |
			    IDT_DF;
		} else {
			/* Inject info2 now; defer info1 to the next entry. */
			vcpu->exit_intinfo = info1;
			*retinfo = info2;
		}
		return (true);
	} else if (VM_INTINFO_PENDING(info1)) {
		*retinfo = info1;
		return (true);
	} else if (VM_INTINFO_PENDING(info2)) {
		*retinfo = info2;
		return (true);
	}

	return (false);
}
/*
 * Fetch (without consuming) the exit-time event info and the pending
 * exception for a vCPU.
 */
int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	const struct vcpu *vcpu = &vm->vcpu[vcpuid];

	*info1 = vcpu->exit_intinfo;
	*info2 = vcpu->exc_pending;
	return (0);
}
/*
 * Queue a hardware exception for injection into the guest.
 *
 * NMI and #DF vectors are rejected here: NMIs travel via vm_inject_nmi()
 * and double faults arise only from coalescing in vm_entry_intinfo().
 *
 * Returns 0 on success, EINVAL for bad vcpuid/vector, or EBUSY if an
 * exception is already pending.
 */
int
vm_inject_exception(struct vm *vm, int vcpuid, uint8_t vector,
    bool errcode_valid, uint32_t errcode, bool restart_instruction)
{
	struct vcpu *vcpu;
	uint64_t regval;
	int error;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	/* Hardware exceptions occupy vectors 0-31 only. */
	if (vector >= 32)
		return (EINVAL);

	/* NMI injection has its own dedicated path (vm_inject_nmi). */
	if (vector == IDT_NMI) {
		return (EINVAL);
	}

	/* #DF is produced only by exception coalescing, never directly. */
	if (vector == IDT_DF) {
		return (EINVAL);
	}

	vcpu = &vm->vcpu[vcpuid];

	/* Only one exception may be pending at a time. */
	if (VM_INTINFO_PENDING(vcpu->exc_pending)) {
		return (EBUSY);
	}

	/*
	 * An error code is only pushed in protected mode; drop it if the
	 * guest is still running in real mode (CR0.PE clear).
	 * (Fix: the '&regval' argument had been corrupted into a bad
	 * character, which broke compilation.)
	 */
	if (errcode_valid) {
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
		VERIFY0(error);
		if ((regval & CR0_PE) == 0) {
			errcode_valid = false;
		}
	}

	/* Clear the interrupt shadow before delivering the exception. */
	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
	VERIFY0(error);

	if (restart_instruction) {
		VERIFY0(vm_restart_instruction(vm, vcpuid));
	}

	uint64_t val = VM_INTINFO_VALID | VM_INTINFO_HWEXCP | vector;
	if (errcode_valid) {
		val |= VM_INTINFO_DEL_ERRCODE;
		val |= (uint64_t)errcode << VM_INTINFO_SHIFT_ERRCODE;
	}
	vcpu->exc_pending = val;
	return (0);
}
/*
 * Inject #UD (invalid opcode): no error code, restart the instruction.
 */
void
vm_inject_ud(struct vm *vm, int vcpuid)
{
	int err = vm_inject_exception(vm, vcpuid, IDT_UD, false, 0, true);

	VERIFY0(err);
}
/*
 * Inject #GP (general protection) with a zero error code; restart the
 * instruction.
 */
void
vm_inject_gp(struct vm *vm, int vcpuid)
{
	int err = vm_inject_exception(vm, vcpuid, IDT_GP, true, 0, true);

	VERIFY0(err);
}
/*
 * Inject #AC (alignment check) with the supplied error code; restart the
 * instruction.
 */
void
vm_inject_ac(struct vm *vm, int vcpuid, uint32_t errcode)
{
	int err = vm_inject_exception(vm, vcpuid, IDT_AC, true, errcode, true);

	VERIFY0(err);
}
/*
 * Inject #SS (stack fault) with the supplied error code; restart the
 * instruction.
 */
void
vm_inject_ss(struct vm *vm, int vcpuid, uint32_t errcode)
{
	int err = vm_inject_exception(vm, vcpuid, IDT_SS, true, errcode, true);

	VERIFY0(err);
}
/*
 * Inject #PF: store the faulting address in %cr2, then queue the exception
 * with its error code; restart the instruction.
 */
void
vm_inject_pf(struct vm *vm, int vcpuid, uint32_t errcode, uint64_t cr2)
{
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2));
	VERIFY0(vm_inject_exception(vm, vcpuid, IDT_PF, true, errcode, true));
}
static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

/*
 * Latch a pending NMI for the vCPU and kick it so the event is noticed.
 */
int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = true;
	vcpu_notify_event(vm, vcpuid);
	return (0);
}
/*
 * Is an NMI latched for this vCPU?
 */
bool
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid].nmi_pending);
}
/*
 * Acknowledge delivery of a latched NMI and bump the per-vCPU stat.
 */
void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	/* Clearing without a latched NMI indicates a caller bug. */
	ASSERT(vcpu->nmi_pending);

	vcpu->nmi_pending = false;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}
static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

/*
 * Latch a pending ExtINT for the vCPU and kick it so the event is noticed.
 */
int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = true;
	vcpu_notify_event(vm, vcpuid);
	return (0);
}
/*
 * Is an ExtINT latched for this vCPU?
 */
bool
vm_extint_pending(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid].extint_pending);
}
/*
 * Acknowledge delivery of a latched ExtINT and bump the per-vCPU stat.
 */
void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	/* Clearing without a latched ExtINT indicates a caller bug. */
	ASSERT(vcpu->extint_pending);

	vcpu->extint_pending = false;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}
/*
 * Post an INIT to the vCPU.  Any pending SIPI is cancelled, since the INIT
 * supersedes it, and the vCPU is kicked so it services the transition.
 */
int
vm_inject_init(struct vm *vm, int vcpuid)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state |= VRS_PEND_INIT;
	/*
	 * As part of queuing the INIT request, clear any pending SIPI.  It
	 * would not otherwise survive across the reset of the vCPU when it
	 * undergoes the requested INIT.
	 */
	vcpu->run_state &= ~VRS_PEND_SIPI;
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);
	return (0);
}
/*
 * Post a SIPI with the given vector to the vCPU.  The vCPU is only kicked
 * when it is in the wait-for-SIPI state (INIT done, not yet running),
 * since that is the only state in which the SIPI is acted upon.
 */
int
vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu->run_state |= VRS_PEND_SIPI;
	vcpu->sipi_vector = vector;
	/* SIPI is only actionable if the CPU is waiting for it. */
	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	}
	vcpu_unlock(vcpu);
	return (0);
}
bool
vcpu_run_state_pending(struct vm *vm, int vcpuid)
{
struct vcpu *vcpu;
ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
vcpu = &vm->vcpu[vcpuid];
return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
}
/*
 * Reset a vCPU's architectural state to power-on/INIT defaults per the x86
 * reset definition: cleared GPRs and control registers, RFLAGS=2,
 * CS:IP = f000:fff0, real-mode segments, and cleared pending events.
 *
 * When `init_only` is set (INIT as opposed to full reset), state which INIT
 * preserves — XCR0, FPU contents, and the MTRRs — is left untouched.
 *
 * Fixes: the vcpuid bounds check now precedes indexing into vm->vcpu
 * (previously the pointer was computed from a potentially out-of-range
 * index), and extint_pending is cleared with `false` to match its bool
 * type.
 */
int
vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
{
	struct seg_desc desc;
	const enum vm_reg_name clear_regs[] = {
		VM_REG_GUEST_CR2,
		VM_REG_GUEST_CR3,
		VM_REG_GUEST_CR4,
		VM_REG_GUEST_RAX,
		VM_REG_GUEST_RBX,
		VM_REG_GUEST_RCX,
		VM_REG_GUEST_RSI,
		VM_REG_GUEST_RDI,
		VM_REG_GUEST_RBP,
		VM_REG_GUEST_RSP,
		VM_REG_GUEST_R8,
		VM_REG_GUEST_R9,
		VM_REG_GUEST_R10,
		VM_REG_GUEST_R11,
		VM_REG_GUEST_R12,
		VM_REG_GUEST_R13,
		VM_REG_GUEST_R14,
		VM_REG_GUEST_R15,
		VM_REG_GUEST_DR0,
		VM_REG_GUEST_DR1,
		VM_REG_GUEST_DR2,
		VM_REG_GUEST_DR3,
		VM_REG_GUEST_EFER,
	};
	const enum vm_reg_name data_segs[] = {
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_ES,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS,
	};

	/* Validate vcpuid before using it to index into the vcpu array. */
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	for (uint_t i = 0; i < nitems(clear_regs); i++) {
		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
	}

	/* RFLAGS bit 1 is reserved and always reads as set. */
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
	/* CR0: CD|NW|ET|... reset value. */
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));

	/* CS at reset: selector f000 with base ffff0000 (so CS:IP=fffffff0) */
	desc.access = 0x0093;
	desc.base = 0xffff0000;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));

	/* Real-mode data segments: base 0, 64 KiB limit, selector 0. */
	desc.access = 0x0093;
	desc.base = 0;
	desc.limit = 0xffff;
	for (uint_t i = 0; i < nitems(data_segs); i++) {
		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
	}

	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));

	/* LDTR: present LDT descriptor type. */
	desc.access = 0x0082;
	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));

	/* TR: busy 32-bit TSS descriptor type. */
	desc.access = 0x008b;
	desc.base = 0;
	desc.limit = 0xffff;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));

	vlapic_reset(vm_lapic(vm, vcpuid));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));

	/* Discard any events which were pending on this vCPU. */
	vcpu->exit_intinfo = 0;
	vcpu->exc_pending = 0;
	vcpu->nmi_pending = false;
	vcpu->extint_pending = false;

	/* State which INIT preserves is only reset on a full reset. */
	if (!init_only) {
		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
		(void) hma_fpu_init(vcpu->guestfpu);
		bzero(&vcpu->mtrr, sizeof (vcpu->mtrr));
	}

	return (0);
}
/*
 * Apply a SIPI vector to a vCPU: real-mode CS gets base (vector << 12) and
 * selector (vector << 8), with execution starting at IP 0.
 */
static int
vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
{
	struct seg_desc desc;

	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	desc.access = 0x0093;
	desc.limit = 0xffff;
	desc.base = (uint64_t)vector << 12;
	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
	    (uint64_t)vector << 8));

	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
	return (0);
}
/*
 * Query a per-vCPU capability via the backend.
 */
int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus || type < 0 ||
	    type >= VM_CAP_MAX) {
		return (EINVAL);
	}

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}
/*
 * Set a per-vCPU capability via the backend.
 */
int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= vm->maxcpus || type < 0 ||
	    type >= VM_CAP_MAX) {
		return (EINVAL);
	}

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}
/*
 * Accessor for a vCPU's CPUID configuration.
 */
vcpu_cpuid_config_t *
vm_cpuid_config(struct vm *vm, int vcpuid)
{
	ASSERT3S(vcpuid, >=, 0);
	ASSERT3S(vcpuid, <, VM_MAXCPU);

	return (&vm->vcpu[vcpuid].cpuid_cfg);
}
/*
 * Accessor for a vCPU's emulated local APIC.
 */
struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	ASSERT3S(cpu, >=, 0);
	ASSERT3S(cpu, <, VM_MAXCPU);

	return (vm->vcpu[cpu].vlapic);
}
/*
 * Accessor for the VM's emulated I/O APIC.
 */
struct vioapic *
vm_ioapic(struct vm *vm)
{
	return (vm->vioapic);
}
/*
 * Accessor for the VM's emulated HPET.
 */
struct vhpet *
vm_hpet(struct vm *vm)
{
	return (vm->vhpet);
}
/*
 * Accessor for the VM's IOMMU domain (opaque handle), if any.
 */
void *
vm_iommu_domain(struct vm *vm)
{
	return (vm->iommu);
}
/*
 * Transition a vCPU to `newstate` under its lock.  A bad vcpuid is a
 * programming error and panics rather than returning.
 */
int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	int error;

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}
/*
 * Read a vCPU's state (and, optionally, the host CPU it last ran on) under
 * its lock.  A bad vcpuid is a programming error and panics.
 */
enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	enum vcpu_state state;

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}
/*
 * Compute the effective TSC offset for a vCPU: the VM-wide offset plus the
 * per-vCPU offset, optionally adjusted for the host TSC delta on this CPU.
 */
uint64_t
vcpu_tsc_offset(struct vm *vm, int vcpuid, bool phys_adj)
{
	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);

	const uint64_t off = vm->tsc_offset + vm->vcpu[vcpuid].tsc_offset;

	return (phys_adj ? off + vmm_host_tsc_delta() : off);
}
/*
 * Accessor for the guest/host TSC frequency multiplier.
 */
uint64_t
vm_get_freq_multiplier(struct vm *vm)
{
	return (vm->freq_multiplier);
}
/*
 * Convert an absolute hrtime into one relative to the VM's boot time.
 * The arithmetic is done in unsigned space to avoid signed-overflow UB.
 */
hrtime_t
vm_normalize_hrtime(struct vm *vm, hrtime_t hrt)
{
	const uint64_t diff = (uint64_t)hrt - (uint64_t)vm->boot_hrtime;

	return ((hrtime_t)diff);
}
/*
 * Convert a VM-relative hrtime back into an absolute one.  The arithmetic
 * is done in unsigned space to avoid signed-overflow UB.
 */
hrtime_t
vm_denormalize_hrtime(struct vm *vm, hrtime_t hrt)
{
	const uint64_t sum = (uint64_t)hrt + (uint64_t)vm->boot_hrtime;

	return ((hrtime_t)sum);
}
/*
 * Mark a vCPU as active (eligible to run).
 *
 * Returns 0 on success, EINVAL for a bad vcpuid, or EBUSY if the vCPU is
 * already active or the VM is suspended.
 *
 * Fix: the original repeated the vm_is_suspended() check *after* setting
 * the active bit and returned EBUSY while leaving the vCPU marked active —
 * an inconsistent state.  The single pre-set check suffices; a suspend
 * racing with activation is caught by the bailout checks on entry.
 */
int
vm_activate_cpu(struct vm *vm, int vcpuid)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	/* Do not permit vCPU activation on a suspended instance. */
	if (vm_is_suspended(vm, NULL)) {
		return (EBUSY);
	}

	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}
/*
 * Place a vCPU (or, with vcpuid == -1, all active vCPUs) into the debug
 * set, kicking each affected vCPU so it exits to userspace.
 */
int
vm_suspend_cpu(struct vm *vm, int vcpuid)
{
	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		/* Broadcast: snapshot all active vCPUs into the debug set. */
		vm->debug_cpus = vm->active_cpus;
		for (int id = 0; id < vm->maxcpus; id++) {
			if (CPU_ISSET(id, &vm->active_cpus))
				vcpu_notify_event(vm, id);
		}
		return (0);
	}

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus);
	vcpu_notify_event(vm, vcpuid);
	return (0);
}
/*
 * Remove a vCPU (or, with vcpuid == -1, all vCPUs) from the debug set.
 */
int
vm_resume_cpu(struct vm *vm, int vcpuid)
{
	if (vcpuid < -1 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	if (vcpuid == -1) {
		CPU_ZERO(&vm->debug_cpus);
		return (0);
	}

	if (!CPU_ISSET(vcpuid, &vm->debug_cpus))
		return (EINVAL);

	CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus);
	return (0);
}
/*
 * Check the conditions which require a vCPU to bail out to userspace
 * instead of (re)entering the guest, populating the vCPU's exitinfo when
 * one applies.  Ordering matters: suspension trumps the one-shot request
 * flags, which trump yield/debug checks.
 *
 * Returns true if a bailout condition was found.
 */
static bool
vcpu_bailout_checks(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	struct vm_exit *vme = &vcpu->exitinfo;

	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);

	/* VM suspension: exitinfo is filled in by vm_is_suspended(). */
	if (vm_is_suspended(vm, vme)) {
		VERIFY3S(vme->exitcode, ==, VM_EXITCODE_SUSPENDED);
		VERIFY(vme->u.suspended.how > VM_SUSPEND_NONE &&
		    vme->u.suspended.how < VM_SUSPEND_LAST);
		return (true);
	}

	/* Request to idle the vCPU (flag cleared elsewhere). */
	if (vcpu->reqidle) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
		return (true);
	}
	/* One-shot barrier request. */
	if (vcpu->reqbarrier) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vcpu->reqbarrier = false;
		return (true);
	}
	/* One-shot exit-at-consistent-state request. */
	if (vcpu->reqconsist) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vcpu->reqconsist = false;
		return (true);
	}
	/* Host scheduler wants the CPU back. */
	if (vcpu_should_yield(vm, vcpuid)) {
		vme->exitcode = VM_EXITCODE_BOGUS;
		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
		return (true);
	}
	/* vCPU is held for debugging. */
	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
		vme->exitcode = VM_EXITCODE_DEBUG;
		return (true);
	}

	return (false);
}
/*
 * Bailout check used from sleep loops: on a bailout, zero inst_length so
 * the interrupted instruction is re-executed on the next entry.
 */
static bool
vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
{
	if (!vcpu_bailout_checks(vm, vcpuid))
		return (false);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu->exitinfo.inst_length = 0;
	return (true);
}
/*
 * Bailout check used before guest entry: on a bailout, record the supplied
 * %rip and zero inst_length in the exit info.
 */
bool
vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
{
	if (!vcpu_bailout_checks(vm, vcpuid))
		return (false);

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu->exitinfo.rip = rip;
	vcpu->exitinfo.inst_length = 0;
	return (true);
}
/*
 * Request a barrier exit from one vCPU, or from all active vCPUs when
 * vcpuid == -1.  Each targeted vCPU has its reqbarrier flag set and is
 * kicked so it exits to a consistent state.
 *
 * Fix: the broadcast loop tested CPU_ISSET(vcpuid, ...) with vcpuid == -1
 * instead of the loop index, so no vCPU was ever barriered in broadcast
 * mode; it now tests the iterated vCPU id.
 */
int
vm_vcpu_barrier(struct vm *vm, int vcpuid)
{
	if (vcpuid >= 0 && vcpuid < vm->maxcpus) {
		/* Barrier a specific vCPU. */
		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		vcpu_lock(vcpu);
		if (CPU_ISSET(vcpuid, &vm->active_cpus)) {
			vcpu->reqbarrier = true;
			vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
		}
		vcpu_unlock(vcpu);
		return (0);
	} else if (vcpuid == -1) {
		/* Barrier all (active) vCPUs. */
		for (int i = 0; i < vm->maxcpus; i++) {
			struct vcpu *vcpu = &vm->vcpu[i];

			vcpu_lock(vcpu);
			if (CPU_ISSET(i, &vm->active_cpus)) {
				vcpu->reqbarrier = true;
				vcpu_notify_event_locked(vcpu,
				    VCPU_NOTIFY_EXIT);
			}
			vcpu_unlock(vcpu);
		}
		return (0);
	} else {
		return (EINVAL);
	}
}
/*
 * Snapshot of the set of active vCPUs.
 */
cpuset_t
vm_active_cpus(struct vm *vm)
{
	return (vm->active_cpus);
}
/*
 * Snapshot of the set of vCPUs held for debugging.
 */
cpuset_t
vm_debug_cpus(struct vm *vm)
{
	return (vm->debug_cpus);
}
/*
 * Accessor for a vCPU's statistics block (opaque handle).
 */
void *
vcpu_stats(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid].stats);
}
/*
 * Read a vCPU's x2APIC mode state.
 */
int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;
	return (0);
}
/*
 * Change a vCPU's x2APIC mode state and propagate it to the vlapic.
 */
int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
		return (EINVAL);
	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;
	vlapic_set_x2apic_state(vm, vcpuid, state);
	return (0);
}
/*
 * Notify a vCPU (caller holds the vcpu lock) of a pending event: post an
 * IPI to a running vCPU on another host CPU (APIC-optimized when the event
 * is an interrupt), or wake it if sleeping.  A running vCPU on the current
 * CPU needs no notification — the event is noticed before re-entry.
 *
 * Fix: the assertion previously read `ntype == VCPU_NOTIFY_APIC ||
 * VCPU_NOTIFY_EXIT`, whose second operand is a bare (truthy) constant, so
 * it could never fire; it now compares ntype against both values.  The
 * empty else arm was also removed.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype)
{
	int hostcpu;

	ASSERT(ntype == VCPU_NOTIFY_APIC || ntype == VCPU_NOTIFY_EXIT);

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (ntype == VCPU_NOTIFY_APIC) {
				vlapic_post_intr(vcpu->vlapic, hostcpu);
			} else {
				poke_cpu(hostcpu);
			}
		}
		/*
		 * If the notification is for the current host CPU, nothing
		 * is needed: the event will be seen before guest re-entry.
		 */
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING) {
			cv_signal(&vcpu->vcpu_cv);
		}
	}
}
/*
 * Notify a vCPU of a generic (exit-type) event, taking its lock.
 */
void
vcpu_notify_event(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
	vcpu_unlock(vcpu);
}
/*
 * Notify a vCPU of an event of the given type, taking its lock.
 * VCPU_NOTIFY_NONE is a no-op.
 */
void
vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype)
{
	if (ntype == VCPU_NOTIFY_NONE) {
		return;
	}

	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, ntype);
	vcpu_unlock(vcpu);
}
/*
 * Transition a vCPU to a new microstate, crediting the time spent in the
 * previous state to its ustate_total accumulator.
 */
void
vcpu_ustate_change(struct vm *vm, int vcpuid, enum vcpu_ustate ustate)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];
	const hrtime_t now = gethrtime();

	ASSERT3S(ustate, <, VU_MAX);
	ASSERT3S(ustate, >=, VU_INIT);

	/* No transition: nothing to account. */
	if (ustate == vcpu->ustate) {
		return;
	}

	/* Credit time in the outgoing state since the last transition. */
	const hrtime_t delta = now - vcpu->ustate_when;
	vcpu->ustate_total[vcpu->ustate] += delta;

	/*
	 * Publish the accumulator update before the timestamp/state change
	 * so lock-free readers do not observe a new state paired with a
	 * stale total.
	 */
	membar_producer();

	vcpu->ustate_when = now;
	vcpu->ustate = ustate;
}
/*
 * Accessor for the backend-private cookie of the VM.
 */
void *
vm_get_cookie(struct vm *vm)
{
	return (vm->cookie);
}
/*
 * Accessor for the VM's address space.
 */
struct vmspace *
vm_get_vmspace(struct vm *vm)
{
	return (vm->vmspace);
}
/*
 * Accessor for a vCPU's vmspace client handle.
 */
struct vm_client *
vm_get_vmclient(struct vm *vm, int vcpuid)
{
	return (vm->vcpu[vcpuid].vmclient);
}
/*
 * Map an APIC ID to a vCPU id.  The mapping is currently the identity.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	return (apicid);
}
/*
 * Accessor for the VM's emulated 8259 PIC.
 */
struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}
/*
 * Accessor for the VM's emulated 8254 PIT.
 */
struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}
/*
 * Accessor for the VM's emulated ACPI PM timer.
 */
struct vpmtmr *
vm_pmtmr(struct vm *vm)
{
	return (vm->vpmtmr);
}
/*
 * Accessor for the VM's emulated RTC.
 */
struct vrtc *
vm_rtc(struct vm *vm)
{
	return (vm->vrtc);
}
/*
 * Map an instruction-decoder segment encoding to its register name.
 */
enum vm_reg_name
vm_segment_name(int seg)
{
	/* Indexed by the decoder's segment-encoding order. */
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}
/*
 * Release the held pages referenced by a copyinfo array (populated by
 * vm_copy_setup) and scrub the array.
 */
void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    uint_t num_copyinfo)
{
	for (uint_t i = 0; i < num_copyinfo; i++) {
		vm_page_t *vmp = (vm_page_t *)copyinfo[i].cookie;

		if (vmp != NULL) {
			(void) vmp_release(vmp);
		}
	}
	bzero(copyinfo, num_copyinfo * sizeof (struct vm_copyinfo));
}
/*
 * Prepare to copy `len` bytes of guest data starting at guest linear
 * address `gla`: walk the guest page tables to resolve each page's GPA,
 * then hold the backing pages and record their kernel-mapped addresses in
 * `copyinfo` for use by vm_copyin()/vm_copyout().
 *
 * Returns 0 with *fault == 0 on success; 0 with *fault != 0 when a fault
 * was injected into the guest during translation; EFAULT if the copyinfo
 * array is too small or a page could not be held; or an errno from the
 * translation.
 */
int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    uint_t num_copyinfo, int *fault)
{
	uint_t idx, nused;
	size_t n, off, remaining;
	vm_client_t *vmc = vm_get_vmclient(vm, vcpuid);

	bzero(copyinfo, sizeof (struct vm_copyinfo) * num_copyinfo);

	/* Phase 1: translate the GLA range into per-page GPA segments. */
	nused = 0;
	remaining = len;
	while (remaining > 0) {
		uint64_t gpa;
		int error;

		if (nused >= num_copyinfo)
			return (EFAULT);
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		/* Each segment is bounded by the end of its page. */
		off = gpa & PAGEOFFSET;
		n = min(remaining, PAGESIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	/* Phase 2: hold each page and record its kernel-visible address. */
	for (idx = 0; idx < nused; idx++) {
		vm_page_t *vmp;
		caddr_t hva;

		vmp = vmc_hold(vmc, copyinfo[idx].gpa & PAGEMASK, prot);
		if (vmp == NULL) {
			break;
		}
		if ((prot & PROT_WRITE) != 0) {
			hva = (caddr_t)vmp_get_writable(vmp);
		} else {
			hva = (caddr_t)vmp_get_readable(vmp);
		}
		copyinfo[idx].hva = hva + (copyinfo[idx].gpa & PAGEOFFSET);
		copyinfo[idx].cookie = vmp;
		copyinfo[idx].prot = prot;
	}

	if (idx != nused) {
		/* A page hold failed: release what was acquired so far. */
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}
/*
 * Copy guest data described by a prepared copyinfo array into the kernel
 * buffer `kaddr`.  The array must have been set up with PROT_READ.
 */
void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst = kaddr;

	for (int i = 0; len > 0; i++) {
		ASSERT(copyinfo[i].prot & PROT_READ);
		bcopy(copyinfo[i].hva, dst, copyinfo[i].len);
		dst += copyinfo[i].len;
		len -= copyinfo[i].len;
	}
}
/*
 * Copy 'len' bytes from the kernel buffer at 'kaddr' out to guest memory
 * described by a copyinfo array prepared via vm_copy_setup().
 */
void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src = kaddr;

	for (uint_t i = 0; len > 0; i++) {
		ASSERT(copyinfo[i].prot & PROT_WRITE);
		bcopy(src, copyinfo[i].hva, copyinfo[i].len);
		src += copyinfo[i].len;
		len -= copyinfo[i].len;
	}
}
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);

/*
 * Stat refresh callback: record the VM's resident memory footprint.
 * The value is VM-wide, so it is tallied only against vcpu 0.
 */
static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{
	if (vcpu != 0)
		return;

	vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
/*
 * Service an in/out access against the VM's registered ioport handlers.
 */
int
vm_ioport_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
    uint8_t bytes, uint32_t *val)
{
	int err;

	err = vm_inout_access(&vm->ioports, in, port, bytes, val);
	return (err);
}
/*
 * Attach a default (in-kernel device) handler for an ioport, yielding an
 * opaque cookie for later detach.
 */
int
vm_ioport_attach(struct vm *vm, uint16_t port, ioport_handler_t func, void *arg,
    void **cookie)
{
	const int res = vm_inout_attach(&vm->ioports, port, IOPF_DEFAULT,
	    func, arg);

	if (res != 0) {
		return (res);
	}
	*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
	return (0);
}
/*
 * Detach an ioport handler previously attached via vm_ioport_attach(),
 * returning the displaced handler/argument to the caller and clearing
 * the cookie on success.
 */
int
vm_ioport_detach(struct vm *vm, void **cookie, ioport_handler_t *old_func,
    void **old_arg)
{
	const uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
	const int res = vm_inout_detach(&vm->ioports, port, false, old_func,
	    old_arg);

	if (res == 0) {
		*cookie = NULL;
	}
	return (res);
}
/*
 * Install a driver-level hook on an ioport.  Port 0 is rejected outright.
 */
int
vm_ioport_hook(struct vm *vm, uint16_t port, ioport_handler_t func,
    void *arg, void **cookie)
{
	if (port == 0) {
		return (EINVAL);
	}

	const int res = vm_inout_attach(&vm->ioports, port, IOPF_DRV_HOOK,
	    func, arg);
	if (res != 0) {
		return (res);
	}
	*cookie = (void *)IOP_GEN_COOKIE(func, arg, port);
	return (0);
}
/*
 * Remove a driver-level ioport hook.  The detach must succeed and the
 * displaced handler must match the cookie, else the system panics.
 */
void
vm_ioport_unhook(struct vm *vm, void **cookie)
{
	const uint16_t port = IOP_PORT_FROM_COOKIE((uintptr_t)*cookie);
	ioport_handler_t func;
	void *arg;

	VERIFY0(vm_inout_detach(&vm->ioports, port, true, &func, &arg));
	VERIFY(IOP_GEN_COOKIE(func, arg, port) == (uintptr_t)*cookie);
	*cookie = NULL;
}
/*
 * kstat update callback for the per-vcpu microstate accounting times.
 * Copies the vcpu's ustate totals into the kstat data buffer.
 */
int
vmm_kstat_update_vcpu(struct kstat *ksp, int rw)
{
	struct vm *vm = ksp->ks_private;
	vmm_vcpu_kstats_t *vvk = ksp->ks_data;
	const int vcpuid = vvk->vvk_vcpu.value.ui32;
	struct vcpu *vcpu;

	/*
	 * Validate the vcpu index before deriving a pointer from it;
	 * previously the pointer was computed first, so an out-of-range
	 * index produced out-of-bounds pointer arithmetic ahead of the
	 * assertion.
	 */
	ASSERT3U(vcpuid, <, VM_MAXCPU);
	vcpu = &vm->vcpu[vcpuid];

	vvk->vvk_time_init.value.ui64 = vcpu->ustate_total[VU_INIT];
	vvk->vvk_time_run.value.ui64 = vcpu->ustate_total[VU_RUN];
	vvk->vvk_time_idle.value.ui64 = vcpu->ustate_total[VU_IDLE];
	vvk->vvk_time_emu_kern.value.ui64 = vcpu->ustate_total[VU_EMU_KERN];
	vvk->vvk_time_emu_user.value.ui64 = vcpu->ustate_total[VU_EMU_USER];
	vvk->vvk_time_sched.value.ui64 = vcpu->ustate_total[VU_SCHED];

	return (0);
}
SET_DECLARE(vmm_data_version_entries, const vmm_data_version_entry_t);
/*
 * Locate the vmm-data version entry matching the class/version in 'req',
 * checking buffer sizing and vcpuid targeting along the way.
 *
 * Returns 0 and stores the matched entry in *resp on success; ENOSPC
 * (with the required size in *req->vdr_result_len) when a fixed-length
 * payload exceeds the caller's buffer; EINVAL when no entry matches or
 * the vcpuid is invalid for the matched entry.
 */
static int
vmm_data_find(const vmm_data_req_t *req, const vmm_data_version_entry_t **resp)
{
	const vmm_data_version_entry_t **vdpp, *vdp;

	ASSERT(resp != NULL);
	ASSERT(req->vdr_result_len != NULL);

	SET_FOREACH(vdpp, vmm_data_version_entries) {
		vdp = *vdpp;
		if (vdp->vdve_class != req->vdr_class ||
		    vdp->vdve_version != req->vdr_version) {
			continue;
		}

		/*
		 * Enforce any fixed-length requirement before handing the
		 * entry back to the caller.
		 */
		if (vdp->vdve_len_expect != 0 &&
		    vdp->vdve_len_expect > req->vdr_len) {
			*req->vdr_result_len = vdp->vdve_len_expect;
			return (ENOSPC);
		}

		if (vdp->vdve_readf != NULL || vdp->vdve_writef != NULL) {
			/* VM-wide handlers require no vcpuid validation */
		} else if (vdp->vdve_vcpu_readf != NULL ||
		    vdp->vdve_vcpu_writef != NULL) {
			/*
			 * The wildcard vcpuid (-1) is allowed only when the
			 * entry explicitly supports it.
			 */
			const int llimit = vdp->vdve_vcpu_wildcard ? -1 : 0;

			if (req->vdr_vcpuid < llimit ||
			    req->vdr_vcpuid >= VM_MAXCPU) {
				return (EINVAL);
			}
		} else {
			/* An entry with no handlers at all is unusable */
			return (EINVAL);
		}

		*resp = vdp;
		return (0);
	}
	return (EINVAL);
}
/*
 * Translate a vmm-data class into the argument handed to that class's
 * VM-wide read/write handler.  Classes serviced per-vcpu (or unknown
 * classes) must never reach this function.
 */
static void *
vmm_data_from_class(const vmm_data_req_t *req, struct vm *vm)
{
	switch (req->vdr_class) {
	/* per-vcpu classes are handled through the vcpu handlers instead */
	case VDC_REGISTER:
	case VDC_MSR:
	case VDC_FPU:
	case VDC_LAPIC:
	case VDC_VMM_ARCH:
		panic("Unexpected per-vcpu class %u", req->vdr_class);
		break;

	case VDC_IOAPIC:
		return (vm->vioapic);
	case VDC_ATPIT:
		return (vm->vatpit);
	case VDC_ATPIC:
		return (vm->vatpic);
	case VDC_HPET:
		return (vm->vhpet);
	case VDC_PM_TIMER:
		return (vm->vpmtmr);
	case VDC_RTC:
		return (vm->vrtc);
	case VDC_VMM_TIME:
		return (vm);
	case VDC_VERSION:
		return (vm);

	default:
		panic("Unexpected class %u", req->vdr_class);
	}
}
/*
 * The fixed set of MSR identifiers emitted (alongside the variable-range
 * MTRR MSRs) when an MSR read request arrives without an explicit list of
 * identifiers to consult.
 */
const uint32_t default_msr_iter[] = {
	MSR_EFER,
	MSR_KGSBASE,
	MSR_STAR,
	MSR_LSTAR,
	MSR_CSTAR,
	MSR_SF_MASK,

	MSR_SYSENTER_CS_MSR,
	MSR_SYSENTER_ESP_MSR,
	MSR_SYSENTER_EIP_MSR,

	MSR_PAT,
	MSR_TSC,

	/* Fixed-range MTRRs and capability/default-type registers */
	MSR_MTRRcap,
	MSR_MTRRdefType,
	MSR_MTRR4kBase, MSR_MTRR4kBase + 1, MSR_MTRR4kBase + 2,
	MSR_MTRR4kBase + 3, MSR_MTRR4kBase + 4, MSR_MTRR4kBase + 5,
	MSR_MTRR4kBase + 6, MSR_MTRR4kBase + 7,
	MSR_MTRR16kBase, MSR_MTRR16kBase + 1,
	MSR_MTRR64kBase,
};
/*
 * Read a single guest MSR for the vmm-data interface.
 *
 * MSR_TSC reads back the stored per-vcpu TSC offset; MTRR-range MSRs are
 * serviced from the in-kernel MTRR state; everything else is fetched from
 * the backend.
 */
static int
vmm_data_read_msr(struct vm *vm, int vcpuid, uint32_t msr, uint64_t *value)
{
	if (msr == MSR_TSC) {
		*value = vm->vcpu[vcpuid].tsc_offset;
		return (0);
	}
	if (is_mtrr_msr(msr)) {
		return (vm_rdmtrr(&vm->vcpu[vcpuid].mtrr, msr, value));
	}
	return (ops->vmgetmsr(vm->cookie, vcpuid, msr, value));
}
/*
 * Write a single guest MSR for the vmm-data interface.
 *
 * MSR_TSC is stored directly as the vcpu's TSC offset.  MSR_MTRRcap is
 * effectively read-only: the write "succeeds" only when the value matches
 * what a read would produce.  Other MTRR-range MSRs go through
 * vm_wrmtrr(), and everything else is handed to the backend.
 */
static int
vmm_data_write_msr(struct vm *vm, int vcpuid, uint32_t msr, uint64_t value)
{
	int err = 0;

	switch (msr) {
	case MSR_TSC:
		vm->vcpu[vcpuid].tsc_offset = value;
		return (0);
	case MSR_MTRRcap: {
		uint64_t comp;

		/* MTRRcap is immutable: accept only an identical value */
		err = vm_rdmtrr(&vm->vcpu[vcpuid].mtrr, msr, &comp);
		if (err == 0 && comp != value) {
			return (EINVAL);
		}
		break;
	}
	default:
		if (is_mtrr_msr(msr)) {
			ASSERT3U(msr, !=, MSR_MTRRcap);
			err = vm_wrmtrr(&vm->vcpu[vcpuid].mtrr, msr, value);
		} else {
			err = ops->vmsetmsr(vm->cookie, vcpuid, msr, value);
		}
		break;
	}
	return (err);
}
/*
 * vmm-data read handler for the MSR class (version 1).
 *
 * With VDX_FLAG_READ_COPYIN the caller supplies a list of field entries
 * whose identifiers name the MSRs to read; each entry's value is filled
 * in place.  Without it, a default set of MSRs (default_msr_iter plus
 * the variable-range MTRR pairs) is emitted, returning ENOSPC (with the
 * needed size in *vdr_result_len) if the buffer is too small.
 */
static int
vmm_data_read_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_MSR);
	VERIFY3U(req->vdr_version, ==, 1);

	struct vdi_field_entry_v1 *entryp = req->vdr_data;

	/* Specific MSRs requested by the caller */
	if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) {
		const uint_t count =
		    req->vdr_len / sizeof (struct vdi_field_entry_v1);

		for (uint_t i = 0; i < count; i++, entryp++) {
			int err = vmm_data_read_msr(vm, vcpuid,
			    entryp->vfe_ident, &entryp->vfe_value);
			if (err != 0) {
				return (err);
			}
		}
		*req->vdr_result_len =
		    count * sizeof (struct vdi_field_entry_v1);
		return (0);
	}

	/* Otherwise emit the full default set */
	const uint_t num_msrs = nitems(default_msr_iter) +
	    (VMM_MTRR_VAR_MAX * 2);
	const uint32_t output_len =
	    num_msrs * sizeof (struct vdi_field_entry_v1);

	*req->vdr_result_len = output_len;
	if (req->vdr_len < output_len) {
		return (ENOSPC);
	}

	/* These reads are expected to succeed for known-good identifiers */
	for (uint_t i = 0; i < nitems(default_msr_iter); i++, entryp++) {
		entryp->vfe_ident = default_msr_iter[i];
		VERIFY0(vmm_data_read_msr(vm, vcpuid, entryp->vfe_ident,
		    &entryp->vfe_value));
	}

	/* Variable-range MTRRs: base/mask pairs */
	for (uint_t i = 0; i < (VMM_MTRR_VAR_MAX * 2); i++, entryp++) {
		entryp->vfe_ident = MSR_MTRRVarBase + i;
		VERIFY0(vmm_data_read_msr(vm, vcpuid, entryp->vfe_ident,
		    &entryp->vfe_value));
	}
	return (0);
}
/*
 * vmm-data write handler for the MSR class (version 1).
 *
 * Performs two passes over the caller's entries: first every identifier
 * is probed with a read so unknown/unsupported MSRs are rejected before
 * any state changes, then the values are written.  Note a failure during
 * the second pass still leaves earlier writes applied.
 */
static int
vmm_data_write_msrs(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_MSR);
	VERIFY3U(req->vdr_version, ==, 1);

	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
	const uint_t entry_count =
	    req->vdr_len / sizeof (struct vdi_field_entry_v1);

	/* Pass 1: verify every named MSR is readable (i.e. recognized) */
	for (uint_t i = 0; i < entry_count; i++, entryp++) {
		const uint64_t msr = entryp->vfe_ident;
		uint64_t val;

		if (vmm_data_read_msr(vm, vcpuid, msr, &val) != 0) {
			return (EINVAL);
		}
	}

	/* Pass 2: apply the writes */
	entryp = req->vdr_data;
	for (uint_t i = 0; i < entry_count; i++, entryp++) {
		int err = vmm_data_write_msr(vm, vcpuid, entryp->vfe_ident,
		    entryp->vfe_value);
		if (err != 0) {
			return (err);
		}
	}
	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
	return (0);
}
/* Version-1 vmm-data registration for per-vcpu MSR access */
static const vmm_data_version_entry_t msr_v1 = {
	.vdve_class = VDC_MSR,
	.vdve_version = 1,
	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
	.vdve_vcpu_readf = vmm_data_read_msrs,
	.vdve_vcpu_writef = vmm_data_write_msrs,
};
VMM_DATA_VERSION(msr_v1);
/* VM-wide VMM_ARCH fields emitted by a default (non-copyin) read */
static const uint32_t vmm_arch_v1_fields[] = {
	VAI_VM_IS_PAUSED,
};

/* Per-vcpu VMM_ARCH fields emitted by a default (non-copyin) read */
static const uint32_t vmm_arch_v1_vcpu_fields[] = {
	VAI_PEND_NMI,
	VAI_PEND_EXTINT,
	VAI_PEND_EXCP,
	VAI_PEND_INTINFO,
};
/*
 * Read a single VMM_ARCH field.  A vcpuid of -1 selects the VM-wide
 * fields; any other value selects per-vcpu fields.
 *
 * Returns true (with the value in *valp) when the identifier is
 * recognized, false otherwise.
 */
static bool
vmm_read_arch_field(struct vm *vm, int vcpuid, uint32_t ident, uint64_t *valp)
{
	ASSERT(valp != NULL);

	if (vcpuid == -1) {
		switch (ident) {
		case VAI_VM_IS_PAUSED:
			*valp = vm->is_paused ? 1 : 0;
			return (true);
		default:
			break;
		}
	} else {
		/*
		 * The bound must exclude VM_MAXCPU: vm->vcpu[] holds exactly
		 * VM_MAXCPU entries, so an index equal to it would access
		 * one past the end of the array.  (The previous check used
		 * '<=', permitting that out-of-bounds index.)
		 */
		VERIFY(vcpuid >= 0 && vcpuid < VM_MAXCPU);

		struct vcpu *vcpu = &vm->vcpu[vcpuid];

		switch (ident) {
		case VAI_PEND_NMI:
			*valp = vcpu->nmi_pending != 0 ? 1 : 0;
			return (true);
		case VAI_PEND_EXTINT:
			*valp = vcpu->extint_pending != 0 ? 1 : 0;
			return (true);
		case VAI_PEND_EXCP:
			*valp = vcpu->exc_pending;
			return (true);
		case VAI_PEND_INTINFO:
			*valp = vcpu->exit_intinfo;
			return (true);
		default:
			break;
		}
	}
	return (false);
}
/*
 * vmm-data read handler for the VMM_ARCH class (version 1).
 *
 * With VDX_FLAG_READ_COPYIN, the caller names the fields to read; each
 * entry's value is filled in place.  Otherwise the default field list
 * for the targeted scope (VM-wide for vcpuid == -1, per-vcpu otherwise)
 * is emitted, returning ENOSPC when the buffer is too small.
 */
static int
vmm_data_read_varch(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
	VERIFY3U(req->vdr_version, ==, 1);

	/* -1 (wildcard) or a valid index are the only acceptable targets */
	if (vcpuid != -1 && (vcpuid < 0 || vcpuid >= VM_MAXCPU)) {
		return (EINVAL);
	}

	struct vdi_field_entry_v1 *entryp = req->vdr_data;

	/* Specific fields requested by the caller */
	if ((req->vdr_flags & VDX_FLAG_READ_COPYIN) != 0) {
		const uint_t count =
		    req->vdr_len / sizeof (struct vdi_field_entry_v1);

		for (uint_t i = 0; i < count; i++, entryp++) {
			if (!vmm_read_arch_field(vm, vcpuid, entryp->vfe_ident,
			    &entryp->vfe_value)) {
				return (EINVAL);
			}
		}
		*req->vdr_result_len =
		    count * sizeof (struct vdi_field_entry_v1);
		return (0);
	}

	/* Otherwise select the default field list for this scope */
	const uint32_t *idents;
	uint_t ident_count;

	if (vcpuid == -1) {
		idents = vmm_arch_v1_fields;
		ident_count = nitems(vmm_arch_v1_fields);
	} else {
		idents = vmm_arch_v1_vcpu_fields;
		ident_count = nitems(vmm_arch_v1_vcpu_fields);
	}

	const uint32_t total_size =
	    ident_count * sizeof (struct vdi_field_entry_v1);

	*req->vdr_result_len = total_size;
	if (req->vdr_len < total_size) {
		return (ENOSPC);
	}

	/* All default idents are known-good, so the reads must succeed */
	for (uint_t i = 0; i < ident_count; i++, entryp++) {
		entryp->vfe_ident = idents[i];
		VERIFY(vmm_read_arch_field(vm, vcpuid, entryp->vfe_ident,
		    &entryp->vfe_value));
	}
	return (0);
}
/*
 * Apply per-vcpu VMM_ARCH field writes: pending NMI, ExtINT, exception,
 * and exit-intinfo state.  Entries are validated as they are applied, so
 * a failure mid-stream leaves earlier writes in place.
 */
static int
vmm_data_write_varch_vcpu(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
	VERIFY3U(req->vdr_version, ==, 1);

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU) {
		return (EINVAL);
	}

	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
	const uint_t entry_count =
	    req->vdr_len / sizeof (struct vdi_field_entry_v1);
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	for (uint_t i = 0; i < entry_count; i++, entryp++) {
		const uint64_t val = entryp->vfe_value;

		switch (entryp->vfe_ident) {
		case VAI_PEND_NMI:
			vcpu->nmi_pending = (val != 0);
			break;
		case VAI_PEND_EXTINT:
			vcpu->extint_pending = (val != 0);
			break;
		case VAI_PEND_EXCP:
			if (!VM_INTINFO_PENDING(val)) {
				/* Clear any pending exception */
				vcpu->exc_pending = 0;
			} else if (VM_INTINFO_TYPE(val) != VM_INTINFO_HWEXCP ||
			    (val & VM_INTINFO_MASK_RSVD) != 0) {
				/*
				 * Only hardware exceptions with no reserved
				 * bits set may be injected this way.
				 */
				return (EINVAL);
			} else {
				vcpu->exc_pending = val;
			}
			break;
		case VAI_PEND_INTINFO:
			/* vm_exit_intinfo() performs its own validation */
			if (vm_exit_intinfo(vm, vcpuid, val) != 0) {
				return (EINVAL);
			}
			break;
		default:
			return (EINVAL);
		}
	}

	*req->vdr_result_len = entry_count * sizeof (struct vdi_field_entry_v1);
	return (0);
}
/*
 * vmm-data write handler for the VMM_ARCH class (version 1).  Writes
 * targeting a specific vcpu are delegated to the per-vcpu handler; the
 * only VM-wide field (is-paused) is not writable through this interface.
 */
static int
vmm_data_write_varch(struct vm *vm, int vcpuid, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VMM_ARCH);
	VERIFY3U(req->vdr_version, ==, 1);

	if (vcpuid != -1) {
		return (vmm_data_write_varch_vcpu(vm, vcpuid, req));
	}

	const struct vdi_field_entry_v1 *entryp = req->vdr_data;
	const uint_t count =
	    req->vdr_len / sizeof (struct vdi_field_entry_v1);

	if (count != 0) {
		/* Pause state must be changed via its dedicated interface */
		return (entryp->vfe_ident == VAI_VM_IS_PAUSED ?
		    EPERM : EINVAL);
	}

	*req->vdr_result_len = 0;
	return (0);
}
/*
 * Version-1 vmm-data registration for VMM_ARCH state; the wildcard flag
 * permits vcpuid == -1 to target the VM-wide fields.
 */
static const vmm_data_version_entry_t vmm_arch_v1 = {
	.vdve_class = VDC_VMM_ARCH,
	.vdve_version = 1,
	.vdve_len_per_item = sizeof (struct vdi_field_entry_v1),
	.vdve_vcpu_readf = vmm_data_read_varch,
	.vdve_vcpu_writef = vmm_data_write_varch,
	.vdve_vcpu_wildcard = true,
};
VMM_DATA_VERSION(vmm_arch_v1);
/*
 * Scale a TSC value by the fixed-point frequency multiplier.  The
 * sentinel VM_TSCM_NOSCALE means no scaling is in effect and the value
 * passes through unchanged.
 */
static uint64_t
vmm_scale_tsc(uint64_t tsc, uint64_t mult)
{
	if (mult == VM_TSCM_NOSCALE) {
		return (tsc);
	}

	const uint32_t frac_size = ops->fr_fracsize;

	VERIFY3U(frac_size, >, 0);
	return (scale_tsc(tsc, mult, frac_size));
}
/*
 * Compute the fixed-point multiplier which converts host TSC ticks (at
 * host_hz) into guest ticks (at guest_hz), with frac_size fractional
 * bits of precision.
 */
uint64_t
vmm_calc_freq_multiplier(uint64_t guest_hz, uint64_t host_hz,
    uint32_t frac_size)
{
	VERIFY3U(guest_hz, !=, 0);
	/* frac_size must leave room for an integer portion of the result */
	VERIFY3U(frac_size, >, 0);
	VERIFY3U(frac_size, <, 64);

	return (calc_freq_multiplier(guest_hz, host_hz, frac_size));
}
/*
 * Compute the offset which, when added to the (scaled) host TSC, yields
 * the desired guest TSC reading.
 *
 * Unsigned subtraction is modular, so a single expression covers both
 * the "guest behind host" and "guest ahead of host" cases.  This also
 * eliminates the previous two-branch form, whose uint64->int64 cast was
 * implementation-defined and whose negation could overflow (UB) when the
 * difference exceeded INT64_MAX.
 */
static uint64_t
calc_tsc_offset(uint64_t base_host_tsc, uint64_t base_guest_tsc, uint64_t mult)
{
	const uint64_t htsc_scaled = vmm_scale_tsc(base_host_tsc, mult);

	return (base_guest_tsc - htsc_scaled);
}
/*
 * Derive the guest TSC reading: the scaled host TSC plus the per-VM
 * offset.
 */
static uint64_t
calc_guest_tsc(uint64_t host_tsc, uint64_t mult, uint64_t offset)
{
	const uint64_t scaled = vmm_scale_tsc(host_tsc, mult);

	return (scaled + offset);
}
/*
 * Capture a mutually-consistent snapshot of the unscaled hrtime (used as
 * the TSC reading), the scaled hrtime, and the wall clock.  Interrupts
 * are disabled so the reads cannot be separated by preemption or
 * interrupt handling.
 */
static void
vmm_time_snapshot(uint64_t *tsc, hrtime_t *hrtime, timespec_t *hrestime)
{
	ulong_t iflag = intr_clear();

	/* One unscaled reading serves as both the TSC and hrtime source */
	hrtime_t hrt = gethrtimeunscaledf();
	*tsc = (uint64_t)hrt;
	*hrtime = hrt;
	scalehrtime(hrtime);
	gethrestime(hrestime);

	intr_restore(iflag);
}
/*
 * vmm-data read handler for VMM_TIME (version 1): export the VM's time
 * state (guest frequency, current guest TSC, boot hrtime) together with
 * the host clock readings taken at the same instant, so a consumer can
 * later re-import the state with drift accounted for.
 */
static int
vmm_data_read_vmm_time(void *arg, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VMM_TIME);
	VERIFY3U(req->vdr_version, ==, 1);
	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_time_info_v1));

	struct vm *vm = arg;
	struct vdi_time_info_v1 *out = req->vdr_data;

	/* Time state is VM-wide; per-vcpu targeting makes no sense */
	if (req->vdr_vcpuid != -1) {
		return (EINVAL);
	}

	/* Sample host TSC, hrtime, and wall clock atomically */
	uint64_t tsc;
	hrtime_t hrtime;
	timespec_t hrestime;
	vmm_time_snapshot(&tsc, &hrtime, &hrestime);

	out->vt_guest_freq = vm->guest_freq;
	out->vt_guest_tsc = calc_guest_tsc(tsc, vm->freq_multiplier,
	    vm->tsc_offset);
	out->vt_boot_hrtime = vm->boot_hrtime;
	out->vt_hrtime = hrtime;
	out->vt_hres_sec = hrestime.tv_sec;
	out->vt_hres_ns = hrestime.tv_nsec;

	return (0);
}
/*
 * vmm-data write handler for VMM_TIME (version 1): import externally
 * captured time state, adjusting the guest TSC for the host time that
 * elapsed between capture and import, and installing a frequency
 * multiplier appropriate for this host.
 */
static int
vmm_data_write_vmm_time(void *arg, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VMM_TIME);
	VERIFY3U(req->vdr_version, ==, 1);
	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_time_info_v1));

	struct vm *vm = arg;
	const struct vdi_time_info_v1 *src = req->vdr_data;

	/* Time state is VM-wide; per-vcpu targeting makes no sense */
	if (req->vdr_vcpuid != -1) {
		return (EINVAL);
	}

	/* A zero guest frequency is nonsensical */
	if (src->vt_guest_freq == 0) {
		return (EINVAL);
	}

	/*
	 * Ask the backend for a multiplier mapping the requested guest
	 * frequency onto this host's frequency, rejecting requests the
	 * hardware cannot satisfy.
	 */
	uint64_t mult = VM_TSCM_NOSCALE;
	freqratio_res_t res = ops->vmfreqratio(src->vt_guest_freq,
	    vmm_host_freq, &mult);
	switch (res) {
	case FR_SCALING_NOT_SUPPORTED:
		return (EPERM);
	case FR_OUT_OF_RANGE:
		return (EINVAL);
	case FR_SCALING_NOT_NEEDED:
		VERIFY3U(mult, ==, VM_TSCM_NOSCALE);
		break;
	case FR_VALID:
		VERIFY3U(mult, !=, VM_TSCM_NOSCALE);
		break;
	}

	/* Sample current host time to measure elapsed time since capture */
	uint64_t tsc;
	hrtime_t hrtime;
	timespec_t hrestime;
	vmm_time_snapshot(&tsc, &hrtime, &hrestime);

	/* The imported snapshot cannot postdate the present */
	if ((src->vt_hrtime > hrtime) || (src->vt_boot_hrtime > hrtime)) {
		return (EINVAL);
	}

	/*
	 * Advance the captured guest TSC by the guest ticks corresponding
	 * to the host time elapsed since the snapshot was taken.
	 */
	hrtime_t hrt_delta = hrtime - src->vt_hrtime;
	const uint64_t host_ticks = unscalehrtime(hrt_delta);
	const uint64_t guest_ticks = vmm_scale_tsc(host_ticks,
	    vm->freq_multiplier);
	const uint64_t base_guest_tsc = src->vt_guest_tsc + guest_ticks;

	/* Install the new time state */
	vm->freq_multiplier = mult;
	vm->guest_freq = src->vt_guest_freq;
	vm->boot_hrtime = src->vt_boot_hrtime;
	vm->tsc_offset = calc_tsc_offset(tsc, base_guest_tsc,
	    vm->freq_multiplier);

	return (0);
}
/* Version-1 vmm-data registration for VM-wide time state */
static const vmm_data_version_entry_t vmm_time_v1 = {
	.vdve_class = VDC_VMM_TIME,
	.vdve_version = 1,
	.vdve_len_expect = sizeof (struct vdi_time_info_v1),
	.vdve_readf = vmm_data_read_vmm_time,
	.vdve_writef = vmm_data_write_vmm_time,
};
VMM_DATA_VERSION(vmm_time_v1);
/*
 * vmm-data read handler for VERSION (version 1): enumerate every
 * registered vmm-data entry (class, version, and sizing) so consumers
 * can discover what the running kernel supports.
 */
static int
vmm_data_read_versions(void *arg, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_VERSION);
	VERIFY3U(req->vdr_version, ==, 1);

	const uint32_t total_size = SET_COUNT(vmm_data_version_entries) *
	    sizeof (struct vdi_version_entry_v1);

	/* Report the required size, bailing if the buffer is too small */
	*req->vdr_result_len = total_size;
	if (req->vdr_len < total_size) {
		return (ENOSPC);
	}

	struct vdi_version_entry_v1 *out = req->vdr_data;
	const vmm_data_version_entry_t **entpp;

	SET_FOREACH(entpp, vmm_data_version_entries) {
		const vmm_data_version_entry_t *ent = *entpp;

		out->vve_class = ent->vdve_class;
		out->vve_version = ent->vdve_version;
		out->vve_len_expect = ent->vdve_len_expect;
		out->vve_len_per_item = ent->vdve_len_per_item;
		out++;
	}
	return (0);
}
/* The version enumeration is read-only; writes are always rejected. */
static int
vmm_data_write_versions(void *arg, const vmm_data_req_t *req)
{
	return (EPERM);
}
/* Version-1 vmm-data registration for the (read-only) version listing */
static const vmm_data_version_entry_t versions_v1 = {
	.vdve_class = VDC_VERSION,
	.vdve_version = 1,
	.vdve_len_per_item = sizeof (struct vdi_version_entry_v1),
	.vdve_readf = vmm_data_read_versions,
	.vdve_writef = vmm_data_write_versions,
};
VMM_DATA_VERSION(versions_v1);
/*
 * Dispatch a vmm-data read request to the handler registered for its
 * class/version, via either the VM-wide or per-vcpu entry point.
 */
int
vmm_data_read(struct vm *vm, const vmm_data_req_t *req)
{
	const vmm_data_version_entry_t *entry = NULL;
	int err;

	err = vmm_data_find(req, &entry);
	if (err != 0) {
		return (err);
	}
	ASSERT(entry != NULL);

	if (entry->vdve_readf != NULL) {
		err = entry->vdve_readf(vmm_data_from_class(req, vm), req);
	} else if (entry->vdve_vcpu_readf != NULL) {
		err = entry->vdve_vcpu_readf(vm, req->vdr_vcpuid, req);
	} else {
		err = EINVAL;
	}

	/* Fixed-size payloads always report their full length on success */
	if (err == 0 && entry->vdve_len_expect != 0) {
		*req->vdr_result_len = entry->vdve_len_expect;
	}
	return (err);
}
/*
 * Dispatch a vmm-data write request to the handler registered for its
 * class/version, via either the VM-wide or per-vcpu entry point.
 */
int
vmm_data_write(struct vm *vm, const vmm_data_req_t *req)
{
	const vmm_data_version_entry_t *entry = NULL;
	int err;

	err = vmm_data_find(req, &entry);
	if (err != 0) {
		return (err);
	}
	ASSERT(entry != NULL);

	if (entry->vdve_writef != NULL) {
		err = entry->vdve_writef(vmm_data_from_class(req, vm), req);
	} else if (entry->vdve_vcpu_writef != NULL) {
		err = entry->vdve_vcpu_writef(vm, req->vdr_vcpuid, req);
	} else {
		err = EINVAL;
	}

	/* Fixed-size payloads always report their full length on success */
	if (err == 0 && entry->vdve_len_expect != 0) {
		*req->vdr_result_len = entry->vdve_len_expect;
	}
	return (err);
}
/*
 * Assert that the MMIO hook configuration starts out pristine: no entry
 * array allocated and a zero count.
 */
static void
vm_mmiohook_init(struct vm *vm, struct mmiohook_config *mh)
{
	VERIFY3P(mh->mhc_entries, ==, NULL);
	VERIFY0(mh->mhc_count);
}
/*
 * Release the MMIO hook entry array (if any) and reset the configuration
 * to its empty state.
 */
static void
vm_mmiohook_cleanup(struct vm *vm, struct mmiohook_config *mh)
{
	if (mh->mhc_count != 0) {
		kmem_free(mh->mhc_entries,
		    sizeof (mmiohook_entry_t) * mh->mhc_count);
		mh->mhc_entries = NULL;
		mh->mhc_count = 0;
	}
}
/*
 * Register an MMIO hook covering [address, address + size).
 *
 * Returns EINVAL for a zero-size or address-wrapping range, ENOSPC when
 * the entry limit is reached, and EEXIST when the range overlaps an
 * existing hook.  On success the entry array is grown (copy + swap +
 * free of the old array) and *cookiep receives an opaque handle for
 * vm_mmio_unhook().
 *
 * NOTE(review): the copy/swap/free of mhc_entries is not locked here;
 * presumably callers serialize hook add/remove — confirm.
 */
int
vm_mmio_hook(struct vm *vm, uint64_t address, uint32_t size,
    mmio_handler_t func, void *arg, void **cookiep)
{
	struct mmiohook_config *mh = &vm->mmiohooks;
	mmiohook_entry_t *old_ents = mh->mhc_entries;
	uint_t old_count = mh->mhc_count;
	mmiohook_entry_t *ents;
	uint_t count = old_count + 1;
	const uint64_t end = address + size;
	const size_t esz = sizeof (mmiohook_entry_t);

	/* Reject empty ranges and ranges which wrap the address space */
	if (size == 0 || end < address)
		return (EINVAL);
	if (old_count >= mmiohook_entry_limit)
		return (ENOSPC);

	/* Refuse any overlap with an existing hook (half-open intervals) */
	for (uint_t i = 0; i < old_count; i++) {
		mmiohook_entry_t *e = &old_ents[i];
		const uint64_t old_end = e->mhe_addr + e->mhe_size;

		if (address < old_end && e->mhe_addr < end)
			return (EEXIST);
	}

	/* Build a replacement array holding the old entries plus the new */
	ents = kmem_alloc(count * esz, KM_SLEEP);
	if (old_count > 0)
		bcopy(old_ents, ents, old_count * esz);

	mmiohook_entry_t *ne = &ents[old_count];
	ne->mhe_func = func;
	ne->mhe_arg = arg;
	ne->mhe_addr = address;
	ne->mhe_size = size;

	/* The base address doubles as the unhook cookie */
	*cookiep = (void *)(uintptr_t)ne->mhe_addr;

	mh->mhc_entries = ents;
	mh->mhc_count = count;
	if (old_count > 0)
		kmem_free(old_ents, old_count * esz);
	return (0);
}
/*
 * Remove the MMIO hook identified by *cookie (the hook's base address).
 *
 * Returns ENOENT if no entry matches.  On success the entry array is
 * shrunk by splicing out the matched index, the old array is freed, and
 * *cookie is cleared.
 *
 * NOTE(review): like vm_mmio_hook(), this rebuilds mhc_entries without
 * visible locking; presumably callers serialize — confirm.
 */
int
vm_mmio_unhook(struct vm *vm, void **cookie)
{
	struct mmiohook_config *mh = &vm->mmiohooks;
	mmiohook_entry_t *old_ents = mh->mhc_entries;
	uint_t old_count = mh->mhc_count;
	const size_t esz = sizeof (mmiohook_entry_t);
	mmiohook_entry_t *ents;
	uint_t i;

	/* Find the entry whose base address matches the cookie */
	for (i = 0; i < old_count; i++) {
		mmiohook_entry_t *e = &old_ents[i];

		if (e->mhe_addr == (uint64_t)(uintptr_t)*cookie)
			break;
	}
	if (i >= old_count)
		return (ENOENT);

	if (old_count == 1) {
		/* Removing the only entry empties the configuration */
		mh->mhc_entries = NULL;
		mh->mhc_count = 0;
	} else {
		/* Copy everything except index i into a smaller array */
		uint_t count = old_count - 1;

		ents = kmem_alloc(count * esz, KM_SLEEP);
		if (i > 0)
			bcopy(old_ents, ents, esz * i);
		if (i < count)
			bcopy(old_ents + i + 1, ents + i, (count - i) * esz);
		mh->mhc_entries = ents;
		mh->mhc_count = count;
	}
	kmem_free(old_ents, old_count * esz);
	*cookie = NULL;
	return (0);
}