/* $FreeBSD: sys/arm64/vmm/vmm.c */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2015 Mihai Carabas <mihai.carabas@gmail.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/fpu.h>
#include <machine/machdep.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <machine/vm.h>
#include <machine/vmparam.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <dev/pci/pcireg.h>

#include <dev/vmm/vmm_dev.h>
#include <dev/vmm/vmm_ktr.h>
#include <dev/vmm/vmm_mem.h>
#include <dev/vmm/vmm_stat.h>
#include <dev/vmm/vmm_vm.h>

#include "arm64.h"
#include "mmu.h"

#include "io/vgic.h"
#include "io/vtimer.h"

static MALLOC_DEFINE(M_VMM, "vmm", "vmm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

/*
 * Snapshot of the AArch64 ID registers that are exposed to guests.
 * Filled in by vmm_regs_init() from the host values, masked down to
 * the features the hypervisor supports (see vmm_arch_regs_masks).
 */
struct vmm_regs {
        uint64_t        id_aa64afr0;
        uint64_t        id_aa64afr1;
        uint64_t        id_aa64dfr0;
        uint64_t        id_aa64dfr1;
        uint64_t        id_aa64isar0;
        uint64_t        id_aa64isar1;
        uint64_t        id_aa64isar2;
        uint64_t        id_aa64mmfr0;
        uint64_t        id_aa64mmfr1;
        uint64_t        id_aa64mmfr2;
        uint64_t        id_aa64pfr0;
        uint64_t        id_aa64pfr1;
};

/*
 * Per-register masks limiting which host CPU features are advertised to
 * guests.  A field left zero here (e.g. the AFR/ISAR1/ISAR2/MMFR2/PFR1
 * masks) causes the corresponding ID register to read as zero in the
 * guest.  Each named value below is the maximum feature level exposed.
 */
static const struct vmm_regs vmm_arch_regs_masks = {
        .id_aa64dfr0 =
            ID_AA64DFR0_CTX_CMPs_MASK |
            ID_AA64DFR0_WRPs_MASK |
            ID_AA64DFR0_BRPs_MASK |
            ID_AA64DFR0_PMUVer_3_9 |
            ID_AA64DFR0_DebugVer_8,
        .id_aa64isar0 =
            ID_AA64ISAR0_TLB_TLBIOSR |
            ID_AA64ISAR0_SHA3_IMPL |
            ID_AA64ISAR0_RDM_IMPL |
            ID_AA64ISAR0_Atomic_IMPL |
            ID_AA64ISAR0_CRC32_BASE |
            ID_AA64ISAR0_SHA2_512 |
            ID_AA64ISAR0_SHA1_BASE |
            ID_AA64ISAR0_AES_PMULL,
        .id_aa64mmfr0 =
            ID_AA64MMFR0_TGran4_IMPL |
            ID_AA64MMFR0_TGran64_IMPL |
            ID_AA64MMFR0_TGran16_IMPL |
            ID_AA64MMFR0_ASIDBits_16 |
            ID_AA64MMFR0_PARange_4P,
        .id_aa64mmfr1 =
            ID_AA64MMFR1_SpecSEI_IMPL |
            ID_AA64MMFR1_PAN_ATS1E1 |
            ID_AA64MMFR1_HAFDBS_AF,
        .id_aa64pfr0 =
            ID_AA64PFR0_GIC_CPUIF_NONE |
            ID_AA64PFR0_AdvSIMD_HP |
            ID_AA64PFR0_FP_HP |
            ID_AA64PFR0_EL3_64 |
            ID_AA64PFR0_EL2_64 |
            ID_AA64PFR0_EL1_64 |
            ID_AA64PFR0_EL0_64,
};

/* Host registers masked by vmm_arch_regs_masks. */
static struct vmm_regs vmm_arch_regs;

/* global statistics */
VMM_STAT(VMEXIT_COUNT, "total number of vm exits");
VMM_STAT(VMEXIT_UNKNOWN, "number of vmexits for the unknown exception");
VMM_STAT(VMEXIT_WFI, "number of times wfi was intercepted");
VMM_STAT(VMEXIT_WFE, "number of times wfe was intercepted");
VMM_STAT(VMEXIT_HVC, "number of times hvc was intercepted");
VMM_STAT(VMEXIT_MSR, "number of times msr/mrs was intercepted");
VMM_STAT(VMEXIT_DATA_ABORT, "number of vmexits for a data abort");
VMM_STAT(VMEXIT_INSN_ABORT, "number of vmexits for an instruction abort");
VMM_STAT(VMEXIT_UNHANDLED_SYNC, "number of vmexits for an unhandled synchronous exception");
VMM_STAT(VMEXIT_IRQ, "number of vmexits for an irq");
VMM_STAT(VMEXIT_FIQ, "number of vmexits for an interrupt");
VMM_STAT(VMEXIT_BRK, "number of vmexits for a breakpoint exception");
VMM_STAT(VMEXIT_SS, "number of vmexits for a single-step exception");
VMM_STAT(VMEXIT_UNHANDLED_EL2, "number of vmexits for an unhandled EL2 exception");
VMM_STAT(VMEXIT_UNHANDLED, "number of vmexits for an unhandled exception");

/*
 * Populate 'regs' with the host's ID register values, masked by the
 * corresponding fields of 'masks'.  Each field is seeded with the mask
 * itself so that registers the kernel cannot read still advertise only
 * supported features.  Always returns 0.
 */
static int
vmm_regs_init(struct vmm_regs *regs, const struct vmm_regs *masks)
{
#define _FETCH_KERN_REG(reg, field) do {                                \
        regs->field = vmm_arch_regs_masks.field;                        \
        get_kernel_reg_iss_masked(reg ## _ISS, &regs->field,            \
            masks->field);                                              \
} while (0)
        _FETCH_KERN_REG(ID_AA64AFR0_EL1, id_aa64afr0);
        _FETCH_KERN_REG(ID_AA64AFR1_EL1, id_aa64afr1);
        _FETCH_KERN_REG(ID_AA64DFR0_EL1, id_aa64dfr0);
        _FETCH_KERN_REG(ID_AA64DFR1_EL1, id_aa64dfr1);
        _FETCH_KERN_REG(ID_AA64ISAR0_EL1, id_aa64isar0);
        _FETCH_KERN_REG(ID_AA64ISAR1_EL1, id_aa64isar1);
        _FETCH_KERN_REG(ID_AA64ISAR2_EL1, id_aa64isar2);
        _FETCH_KERN_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0);
        _FETCH_KERN_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1);
        _FETCH_KERN_REG(ID_AA64MMFR2_EL1, id_aa64mmfr2);
        _FETCH_KERN_REG(ID_AA64PFR0_EL1, id_aa64pfr0);
        _FETCH_KERN_REG(ID_AA64PFR1_EL1, id_aa64pfr1);
#undef _FETCH_KERN_REG
        return (0);
}

/*
 * Release a vCPU's backend state.  With destroy == false (VM reset) only
 * the vmmops cookie is torn down so the vCPU can be re-initialized; with
 * destroy == true the vCPU and all of its resources are freed.
 */
static void
vcpu_cleanup(struct vcpu *vcpu, bool destroy)
{
        /* Always drop the architecture backend state. */
        vmmops_vcpu_cleanup(vcpu->cookie);
        vcpu->cookie = NULL;

        if (!destroy)
                return;

        /* Full teardown: free the per-vCPU allocations and the vCPU itself. */
        vmm_stat_free(vcpu->stats);
        fpu_save_area_free(vcpu->guestfpu);
        vcpu_lock_destroy(vcpu);
        free(vcpu, M_VMM);
}

/*
 * Allocate and minimally initialize a vCPU structure for 'vm'.  The
 * backend cookie is established later by vcpu_init().  M_WAITOK means
 * the allocations cannot fail.
 */
static struct vcpu *
vcpu_alloc(struct vm *vm, int vcpu_id)
{
        struct vcpu *new_vcpu;

        KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
            ("vcpu_alloc: invalid vcpu %d", vcpu_id));

        new_vcpu = malloc(sizeof(*new_vcpu), M_VMM, M_WAITOK | M_ZERO);
        vcpu_lock_init(new_vcpu);

        /* Identity and owning VM. */
        new_vcpu->vm = vm;
        new_vcpu->vcpuid = vcpu_id;

        /* Not yet scheduled on any host CPU. */
        new_vcpu->state = VCPU_IDLE;
        new_vcpu->hostcpu = NOCPU;

        /* Per-vCPU FPU save area and statistics buffers. */
        new_vcpu->guestfpu = fpu_save_area_alloc();
        new_vcpu->stats = vmm_stat_alloc();

        return (new_vcpu);
}

/*
 * (Re)initialize a vCPU: create the backend cookie, reset the guest FPU
 * save area and zero the statistics.  Used both at first allocation and
 * on VM reset.
 */
static void
vcpu_init(struct vcpu *vcpu)
{
        vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
        MPASS(vcpu->cookie != NULL);
        fpu_save_area_reset(vcpu->guestfpu);
        vmm_stat_init(vcpu->stats);
}

/* Return the vCPU's exit information structure (filled in on VM exit). */
struct vm_exit *
vm_exitinfo(struct vcpu *vcpu)
{
        return (&vcpu->exitinfo);
}

static int
vmm_unsupported_quirk(void)
{
        /*
         * Known to not load on Ampere eMAG
         * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=285051
         */
        if (CPU_MATCH(CPU_IMPL_MASK | CPU_PART_MASK, CPU_IMPL_APM,
            CPU_PART_EMAG8180, 0, 0))
                return (ENXIO);

        return (0);
}

int
vmm_modinit(void)
{
        int error;

        error = vmm_unsupported_quirk();
        if (error != 0)
                return (error);

        error = vmm_regs_init(&vmm_arch_regs, &vmm_arch_regs_masks);
        if (error != 0)
                return (error);

        return (vmmops_modinit(0));
}

/* Module unload hook: delegate to the backend's cleanup. */
int
vmm_modcleanup(void)
{
        return (vmmops_modcleanup());
}

/*
 * (Re)initialize the runtime state of a VM.  Called with create == true
 * from vm_create() and create == false from vm_reset(); in the reset
 * case any already-allocated vCPUs are re-initialized in place.
 */
static void
vm_init(struct vm *vm, bool create)
{
        int i;

        vm->cookie = vmmops_init(vm, vmspace_pmap(vm_vmspace(vm)));
        MPASS(vm->cookie != NULL);

        CPU_ZERO(&vm->active_cpus);
        CPU_ZERO(&vm->debug_cpus);

        vm->suspend = 0;
        CPU_ZERO(&vm->suspended_cpus);

        /* Clear all registered MMIO and special-register handlers. */
        memset(vm->mmio_region, 0, sizeof(vm->mmio_region));
        memset(vm->special_reg, 0, sizeof(vm->special_reg));

        if (!create) {
                /* Reset path: re-init vCPUs that survived vm_cleanup(). */
                for (i = 0; i < vm->maxcpus; i++) {
                        if (vm->vcpu[i] != NULL)
                                vcpu_init(vm->vcpu[i]);
                }
        }
}

/*
 * Return the vCPU for 'vcpuid', lazily creating it on first use.
 * The common case is a lock-free acquire-load of an existing pointer;
 * creation is serialized by vcpus_init_lock.  Returns NULL for an
 * out-of-range id, a dying VM, or an id beyond the vGIC's CPU limit.
 */
struct vcpu *
vm_alloc_vcpu(struct vm *vm, int vcpuid)
{
        struct vcpu *vcpu;

        if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
                return (NULL);

        /* Fast path: already created; acquire pairs with the release below. */
        vcpu = (struct vcpu *)
            atomic_load_acq_ptr((uintptr_t *)&vm->vcpu[vcpuid]);
        if (__predict_true(vcpu != NULL))
                return (vcpu);

        sx_xlock(&vm->vcpus_init_lock);
        vcpu = vm->vcpu[vcpuid];        /* re-check under the lock */
        if (vcpu == NULL && !vm->dying) {
                /* Some interrupt controllers may have a CPU limit */
                if (vcpuid >= vgic_max_cpu_count(vm->cookie)) {
                        sx_xunlock(&vm->vcpus_init_lock);
                        return (NULL);
                }

                vcpu = vcpu_alloc(vm, vcpuid);
                vcpu_init(vcpu);

                /*
                 * Ensure vCPU is fully created before updating pointer
                 * to permit unlocked reads above.
                 */
                atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
                    (uintptr_t)vcpu);
        }
        sx_xunlock(&vm->vcpus_init_lock);
        return (vcpu);
}

int
vm_create(const char *name, struct vm **retvm)
{
        struct vm *vm;
        int error;

        vm = malloc(sizeof(struct vm), M_VMM, M_WAITOK | M_ZERO);
        error = vm_mem_init(&vm->mem, 0, 1ul << 39);
        if (error != 0) {
                free(vm, M_VMM);
                return (error);
        }
        strcpy(vm->name, name);
        mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
        sx_init(&vm->vcpus_init_lock, "vm vcpus");

        vm->sockets = 1;
        vm->cores = 1;                  /* XXX backwards compatibility */
        vm->threads = 1;                /* XXX backwards compatibility */
        vm->maxcpus = vm_maxcpu;

        vm->vcpu = malloc(sizeof(*vm->vcpu) * vm->maxcpus, M_VMM,
            M_WAITOK | M_ZERO);

        vm_init(vm, true);

        *retvm = vm;
        return (0);
}

/*
 * Tear down a VM's runtime state.  With destroy == false (reset) only
 * the per-run state is released so vm_init() can rebuild it; with
 * destroy == true everything allocated by vm_create() is freed.
 */
static void
vm_cleanup(struct vm *vm, bool destroy)
{
        pmap_t pmap __diagused;
        int i;

        if (destroy) {
                vm_xlock_memsegs(vm);
                pmap = vmspace_pmap(vm_vmspace(vm));
                /* Clear this CPU's cached vm pmap pointer. */
                sched_pin();
                PCPU_SET(curvmpmap, NULL);
                sched_unpin();
                /* No other CPU may still reference the dying pmap. */
                CPU_FOREACH(i) {
                        MPASS(cpuid_to_pcpu[i]->pc_curvmpmap != pmap);
                }
        } else
                vm_assert_memseg_xlocked(vm);

        vgic_detach_from_vm(vm->cookie);

        for (i = 0; i < vm->maxcpus; i++) {
                if (vm->vcpu[i] != NULL)
                        vcpu_cleanup(vm->vcpu[i], destroy);
        }

        vmmops_cleanup(vm->cookie);

        vm_mem_cleanup(vm);
        if (destroy) {
                vm_mem_destroy(vm);

                free(vm->vcpu, M_VMM);
                sx_destroy(&vm->vcpus_init_lock);
                /* Pairs with mtx_init() in vm_create(); was never destroyed. */
                mtx_destroy(&vm->rendezvous_mtx);
        }
}

/* Fully destroy a VM and free its top-level structure. */
void
vm_destroy(struct vm *vm)
{
        vm_cleanup(vm, true);
        free(vm, M_VMM);
}

/* Reset a VM in place: tear down and rebuild its runtime state. */
void
vm_reset(struct vm *vm)
{
        vm_cleanup(vm, false);
        vm_init(vm, false);
}

/*
 * Translate a guest linear address to a guest physical address without
 * taking a fault; '*is_fault' reports whether the walk would have faulted.
 */
int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *is_fault)
{
        return (vmmops_gla2gpa(vcpu->cookie, paging, gla, prot, gpa, is_fault));
}

/* Read handler for registers that are Read-As-Zero. */
static int
vmm_reg_raz(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
        *rval = 0;
        return (0);
}

/* Read handler returning a fixed value; 'arg' points at the uint64_t. */
static int
vmm_reg_read_arg(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
        *rval = *(uint64_t *)arg;
        return (0);
}

/* Write handler for registers whose writes are ignored (Write-Ignore). */
static int
vmm_reg_wi(struct vcpu *vcpu, uint64_t wval, void *arg)
{
        return (0);
}

/*
 * Emulate a guest write to OSLAR_EL1 by latching the OS Lock state in
 * the vCPU context; the value is reported back via OSLSR_EL1.
 */
static int
vmm_write_oslar_el1(struct vcpu *vcpu, uint64_t wval, void *arg)
{
        struct hypctx *hypctx;

        hypctx = vcpu_get_cookie(vcpu);
        /* All other fields are RES0 & we don't do anything with this */
        /* TODO: Disable access to other debug state when locked */
        hypctx->dbg_oslock = (wval & OSLAR_OSLK) == OSLAR_OSLK;
        return (0);
}

/*
 * Emulate a guest read of OSLSR_EL1: OSLM reports the OS Lock as
 * implemented and OSLK reflects the lock bit latched by the most recent
 * guest write to OSLAR_EL1.
 */
static int
vmm_read_oslsr_el1(struct vcpu *vcpu, uint64_t *rval, void *arg)
{
        struct hypctx *hypctx;

        hypctx = vcpu_get_cookie(vcpu);
        *rval = OSLSR_OSLM_1 | (hypctx->dbg_oslock ? OSLSR_OSLK : 0);

        return (0);
}

/*
 * Default (global) special-register emulation table, consulted after any
 * per-VM handlers in vm_handle_reg_emul().  SPECIAL_REG() builds an exact
 * ISS match for one system register; ID_SPECIAL_REG() exposes the masked
 * host value from vmm_arch_regs and ignores writes.
 */
static const struct vmm_special_reg vmm_special_regs[] = {
#define SPECIAL_REG(_reg, _read, _write)                                \
        {                                                               \
                .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |      \
                    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |             \
                    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |             \
                    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |             \
                    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),              \
                .esr_mask = ISS_MSR_REG_MASK,                           \
                .reg_read = (_read),                                    \
                .reg_write = (_write),                                  \
                .arg = NULL,                                            \
        }
#define ID_SPECIAL_REG(_reg, _name)                                     \
        {                                                               \
                .esr_iss = ((_reg ## _op0) << ISS_MSR_OP0_SHIFT) |      \
                    ((_reg ## _op1) << ISS_MSR_OP1_SHIFT) |             \
                    ((_reg ## _CRn) << ISS_MSR_CRn_SHIFT) |             \
                    ((_reg ## _CRm) << ISS_MSR_CRm_SHIFT) |             \
                    ((_reg ## _op2) << ISS_MSR_OP2_SHIFT),              \
                .esr_mask = ISS_MSR_REG_MASK,                           \
                .reg_read = vmm_reg_read_arg,                           \
                .reg_write = vmm_reg_wi,                                \
                .arg = &(vmm_arch_regs._name),                          \
        }

        /* ID registers */
        ID_SPECIAL_REG(ID_AA64PFR0_EL1, id_aa64pfr0),
        ID_SPECIAL_REG(ID_AA64DFR0_EL1, id_aa64dfr0),
        ID_SPECIAL_REG(ID_AA64ISAR0_EL1, id_aa64isar0),
        ID_SPECIAL_REG(ID_AA64MMFR0_EL1, id_aa64mmfr0),
        ID_SPECIAL_REG(ID_AA64MMFR1_EL1, id_aa64mmfr1),

        /*
         * All other ID registers are read as zero.
         * They are all in the op0=3, op1=0, CRn=0, CRm={0..7} space.
         */
        {
                .esr_iss = (3 << ISS_MSR_OP0_SHIFT) |
                    (0 << ISS_MSR_OP1_SHIFT) |
                    (0 << ISS_MSR_CRn_SHIFT) |
                    (0 << ISS_MSR_CRm_SHIFT),
                .esr_mask = ISS_MSR_OP0_MASK | ISS_MSR_OP1_MASK |
                    ISS_MSR_CRn_MASK | (0x8 << ISS_MSR_CRm_SHIFT),
                .reg_read = vmm_reg_raz,
                .reg_write = vmm_reg_wi,
                .arg = NULL,
        },

        /* Counter physical registers */
        SPECIAL_REG(CNTP_CTL_EL0, vtimer_phys_ctl_read, vtimer_phys_ctl_write),
        SPECIAL_REG(CNTP_CVAL_EL0, vtimer_phys_cval_read,
            vtimer_phys_cval_write),
        SPECIAL_REG(CNTP_TVAL_EL0, vtimer_phys_tval_read,
            vtimer_phys_tval_write),
        SPECIAL_REG(CNTPCT_EL0, vtimer_phys_cnt_read, vtimer_phys_cnt_write),

        /* Debug registers */
        SPECIAL_REG(DBGPRCR_EL1, vmm_reg_raz, vmm_reg_wi),
        SPECIAL_REG(OSDLR_EL1, vmm_reg_raz, vmm_reg_wi),
        /* TODO: Exceptions on invalid access */
        SPECIAL_REG(OSLAR_EL1, vmm_reg_raz, vmm_write_oslar_el1),
        SPECIAL_REG(OSLSR_EL1, vmm_read_oslsr_el1, vmm_reg_wi),
#undef SPECIAL_REG
};

/*
 * Register a per-VM special-register handler in the first free slot of
 * vm->special_reg.  A slot is free when both its iss and mask are zero.
 * Panics if the table is full.
 */
void
vm_register_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask,
    reg_read_t reg_read, reg_write_t reg_write, void *arg)
{
        struct vmm_special_reg *slot;
        int i;

        for (i = 0; i < nitems(vm->special_reg); i++) {
                slot = &vm->special_reg[i];
                if (slot->esr_iss != 0 || slot->esr_mask != 0)
                        continue;       /* slot in use */
                slot->esr_iss = iss;
                slot->esr_mask = mask;
                slot->reg_read = reg_read;
                slot->reg_write = reg_write;
                slot->arg = arg;
                return;
        }

        panic("%s: No free special register slot", __func__);
}

/*
 * Remove the per-VM special-register handler whose iss/mask match
 * exactly; the slot is zeroed so it can be reused.  Panics if no such
 * handler was registered.
 */
void
vm_deregister_reg_handler(struct vm *vm, uint64_t iss, uint64_t mask)
{
        struct vmm_special_reg *slot;
        int i;

        for (i = 0; i < nitems(vm->special_reg); i++) {
                slot = &vm->special_reg[i];
                if (slot->esr_iss != iss || slot->esr_mask != mask)
                        continue;
                memset(slot, 0, sizeof(*slot));
                return;
        }

        panic("%s: Invalid special register: iss %lx mask %lx", __func__, iss,
            mask);
}

/*
 * Attempt to emulate one table entry against the trapped instruction
 * syndrome.  Returns true when the entry matched, with the emulation
 * result stored in '*rvp' (and '*retu' cleared on success).
 */
static bool
vm_try_reg_emul(struct vcpu *vcpu, struct vre *vre,
    const struct vmm_special_reg *reg, bool *retu, int *rvp)
{
        int rv;

        if ((vre->inst_syndrome & reg->esr_mask) != reg->esr_iss)
                return (false);

        rv = vmm_emulate_register(vcpu, vre, reg->reg_read, reg->reg_write,
            reg->arg);
        if (rv == 0)
                *retu = false;
        *rvp = rv;
        return (true);
}

/*
 * Handle a special-register (MSR/MRS) trap.  Per-VM handlers are
 * consulted first so they can override the global defaults; if nothing
 * matches, '*retu' is set so userspace can handle the access.
 */
static int
vm_handle_reg_emul(struct vcpu *vcpu, bool *retu)
{
        struct vm *vm;
        struct vm_exit *vme;
        struct vre *vre;
        int i, rv;

        vm = vcpu->vm;
        vme = &vcpu->exitinfo;
        vre = &vme->u.reg_emul.vre;

        /* Per-VM handlers take precedence over the global table. */
        for (i = 0; i < nitems(vm->special_reg); i++) {
                if (vm->special_reg[i].esr_iss == 0 &&
                    vm->special_reg[i].esr_mask == 0)
                        continue;       /* empty slot */
                if (vm_try_reg_emul(vcpu, vre, &vm->special_reg[i], retu, &rv))
                        return (rv);
        }
        for (i = 0; i < nitems(vmm_special_regs); i++) {
                if (vm_try_reg_emul(vcpu, vre, &vmm_special_regs[i], retu, &rv))
                        return (rv);
        }

        /* Unknown register: punt to userspace. */
        *retu = true;
        return (0);
}

/*
 * Register an in-kernel MMIO emulation handler for the guest-physical
 * range [start, start + size).  A region slot is free when both its
 * start and end are zero.  Panics if the table is full.
 */
void
vm_register_inst_handler(struct vm *vm, uint64_t start, uint64_t size,
    mem_region_read_t mmio_read, mem_region_write_t mmio_write)
{
        struct vmm_mmio_region *region;
        int i;

        for (i = 0; i < nitems(vm->mmio_region); i++) {
                region = &vm->mmio_region[i];
                if (region->start != 0 || region->end != 0)
                        continue;       /* slot in use */
                region->start = start;
                region->end = start + size;
                region->read = mmio_read;
                region->write = mmio_write;
                return;
        }

        panic("%s: No free MMIO region", __func__);
}

/*
 * Remove the MMIO handler covering exactly [start, start + size); the
 * region slot is zeroed for reuse.  Panics if no such region exists.
 */
void
vm_deregister_inst_handler(struct vm *vm, uint64_t start, uint64_t size)
{
        struct vmm_mmio_region *region;
        int i;

        for (i = 0; i < nitems(vm->mmio_region); i++) {
                region = &vm->mmio_region[i];
                if (region->start != start || region->end != start + size)
                        continue;
                memset(region, 0, sizeof(*region));
                return;
        }

        panic("%s: Invalid MMIO region: %lx - %lx", __func__, start,
            start + size);
}

/*
 * Handle an instruction-emulation exit (MMIO access).  If the faulting
 * IPA falls inside a registered MMIO region the access is emulated in
 * the kernel; otherwise '*retu' is set so userspace handles it.  A VM
 * without an attached vGIC always defers to userspace.
 */
static int
vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
{
        struct vm *vm;
        struct vm_exit *vme;
        struct vie *vie;
        struct hyp *hyp;
        uint64_t fault_ipa;
        struct vm_guest_paging *paging;
        struct vmm_mmio_region *vmr;
        int error, i;

        vm = vcpu->vm;
        hyp = vm->cookie;
        if (!hyp->vgic_attached)
                goto out_user;

        vme = &vcpu->exitinfo;
        vie = &vme->u.inst_emul.vie;
        paging = &vme->u.inst_emul.paging;

        fault_ipa = vme->u.inst_emul.gpa;

        /* Find the MMIO region containing the faulting IPA, if any. */
        vmr = NULL;
        for (i = 0; i < nitems(vm->mmio_region); i++) {
                if (vm->mmio_region[i].start <= fault_ipa &&
                    vm->mmio_region[i].end > fault_ipa) {
                        vmr = &vm->mmio_region[i];
                        break;
                }
        }
        if (vmr == NULL)
                goto out_user;

        error = vmm_emulate_instruction(vcpu, fault_ipa, vie, paging,
            vmr->read, vmr->write, retu);
        return (error);

out_user:
        *retu = true;
        return (0);
}

/*
 * Populate the vCPU's exit information for a VM-suspend exit.  The
 * inst_length of 4 means the exit restarts after the current (fixed
 * 4-byte AArch64) instruction.
 */
void
vm_exit_suspended(struct vcpu *vcpu, uint64_t pc)
{
        struct vm *vm = vcpu->vm;
        struct vm_exit *vmexit;

        KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
            ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

        vmexit = vm_exitinfo(vcpu);
        vmexit->pc = pc;
        vmexit->inst_length = 4;
        vmexit->exitcode = VM_EXITCODE_SUSPENDED;
        vmexit->u.suspended.how = vm->suspend;
}

/* Populate the vCPU's exit information for a debug exit at 'pc'. */
void
vm_exit_debug(struct vcpu *vcpu, uint64_t pc)
{
        struct vm_exit *vmexit;

        vmexit = vm_exitinfo(vcpu);
        vmexit->pc = pc;
        vmexit->inst_length = 4;        /* fixed AArch64 instruction size */
        vmexit->exitcode = VM_EXITCODE_DEBUG;
}

/*
 * Switch the VFP/SIMD unit from host to guest state.  Must run in a
 * critical section (see vm_run()) so the thread cannot migrate while
 * the hardware holds guest registers.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

        /* flush host state to the pcb */
        vfp_save_state(curthread, curthread->td_pcb);
        /* Ensure the VFP state will be re-loaded when exiting the guest */
        PCPU_SET(fpcurthread, NULL);

        /* restore guest FPU state */
        vfp_enable();
        vfp_restore(vcpu->guestfpu);

        /*
         * The FPU is now "dirty" with the guest's state so turn on emulation
         * to trap any access to the FPU by the host.
         */
        vfp_disable();
}

/*
 * Switch the VFP/SIMD unit from guest back to host state.  Counterpart
 * of restore_guest_fpustate(); also runs inside vm_run()'s critical
 * section.  Panics if host FPU trapping was not left enabled.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{
        if ((READ_SPECIALREG(cpacr_el1) & CPACR_FPEN_MASK) !=
            CPACR_FPEN_TRAP_ALL1)
                panic("VFP not enabled in host!");

        /* save guest FPU state */
        vfp_enable();
        vfp_store(vcpu->guestfpu);
        vfp_disable();

        KASSERT(PCPU_GET(fpcurthread) == NULL,
            ("%s: fpcurthread set with guest registers", __func__));
}

static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
                panic("Error %d setting state to %d\n", error, newstate);
}

/*
 * Like vcpu_require_state(), but for callers already holding the
 * vCPU lock.
 */
static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
        int error;

        if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
                panic("Error %d setting state to %d", error, newstate);
}

/*
 * Fetch a vCPU capability value into '*retval'.
 * Returns EINVAL for an out-of-range capability type.
 */
int
vm_get_capability(struct vcpu *vcpu, int type, int *retval)
{
        if (type >= 0 && type < VM_CAP_MAX)
                return (vmmops_getcap(vcpu->cookie, type, retval));

        return (EINVAL);
}

/*
 * Set a vCPU capability to 'val'.
 * Returns EINVAL for an out-of-range capability type.
 */
int
vm_set_capability(struct vcpu *vcpu, int type, int val)
{
        if (type >= 0 && type < VM_CAP_MAX)
                return (vmmops_setcap(vcpu->cookie, type, val));

        return (EINVAL);
}

/* Return the backend (hypctx) cookie associated with a vCPU. */
void *
vcpu_get_cookie(struct vcpu *vcpu)
{
        return (vcpu->cookie);
}

/*
 * Read a guest register into '*retval'.
 * Returns EINVAL for an out-of-range register identifier.
 */
int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
{
        if (reg >= 0 && reg < VM_REG_LAST)
                return (vmmops_getreg(vcpu->cookie, reg, retval));

        return (EINVAL);
}

/*
 * Write a guest register.  A successful write to the guest PC also
 * updates vcpu->nextpc so the next vmmops_run() resumes at the new
 * address.  Returns EINVAL for an out-of-range register identifier.
 */
int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
        int error;

        if (reg < 0 || reg >= VM_REG_LAST)
                return (EINVAL);

        error = vmmops_setreg(vcpu->cookie, reg, val);
        if (error == 0 && reg == VM_REG_GUEST_PC)
                vcpu->nextpc = val;

        return (error);
}

/* Return the backend (hyp) cookie associated with a VM. */
void *
vm_get_cookie(struct vm *vm)
{
        return (vm->cookie);
}

/* Inject a synchronous exception (with ESR/FAR values) into the vCPU. */
int
vm_inject_exception(struct vcpu *vcpu, uint64_t esr, uint64_t far)
{
        return (vmmops_exception(vcpu->cookie, esr, far));
}

/* Attach a virtual GIC described by 'descr' to the VM. */
int
vm_attach_vgic(struct vm *vm, struct vm_vgic_descr *descr)
{
        return (vgic_attach_to_vm(vm->cookie, descr));
}

/* Assert (raise) interrupt 'irq' on the VM's vGIC (-1: no target vCPU). */
int
vm_assert_irq(struct vm *vm, uint32_t irq)
{
        return (vgic_inject_irq(vm->cookie, -1, irq, true));
}

/* Deassert (lower) interrupt 'irq' on the VM's vGIC. */
int
vm_deassert_irq(struct vm *vm, uint32_t irq)
{
        return (vgic_inject_irq(vm->cookie, -1, irq, false));
}

/*
 * Deliver an MSI to the VM's vGIC.  The PCI bus/slot/func identity is
 * currently unused by the vGIC backend.
 */
int
vm_raise_msi(struct vm *vm, uint64_t msg, uint64_t addr, int bus, int slot,
    int func)
{
        /* TODO: Should we raise an SError? */
        return (vgic_inject_msi(vm->cookie, msg, addr));
}

/*
 * Handle an HVC exit as an SMCCC call.  A non-zero HVC immediate (in
 * the ESR ISS) is not an SMCCC call, so return non-zero to signal the
 * exit was not consumed.  Otherwise package the function id and
 * arguments from x0..xN into the exit info and defer to userspace.
 */
static int
vm_handle_smccc_call(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
        struct hypctx *hypctx;
        int i;

        hypctx = vcpu_get_cookie(vcpu);

        /* SMCCC requires the HVC immediate to be zero. */
        if ((hypctx->tf.tf_esr & ESR_ELx_ISS_MASK) != 0)
                return (1);

        vme->exitcode = VM_EXITCODE_SMCCC;
        vme->u.smccc_call.func_id = hypctx->tf.tf_x[0];
        for (i = 0; i < nitems(vme->u.smccc_call.args); i++)
                vme->u.smccc_call.args[i] = hypctx->tf.tf_x[i + 1];

        *retu = true;
        return (0);
}

/*
 * Handle a WFI exit by sleeping the vCPU until the VM suspends, an
 * interrupt becomes pending in the vGIC, or the scheduler wants the
 * host CPU back.  The exit never bounces to userspace (*retu = false).
 */
static int
vm_handle_wfi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
{
        struct vm *vm;

        vm = vcpu->vm;
        vcpu_lock(vcpu);
        while (1) {
                if (vm->suspend)
                        break;

                if (vgic_has_pending_irq(vcpu->cookie))
                        break;

                if (vcpu_should_yield(vcpu))
                        break;

                vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
                /*
                 * XXX msleep_spin() cannot be interrupted by signals so
                 * wake up periodically to check pending signals.
                 */
                msleep_spin(vcpu, &vcpu->mtx, "vmidle", hz);
                vcpu_require_state_locked(vcpu, VCPU_FROZEN);
        }
        vcpu_unlock(vcpu);

        *retu = false;
        return (0);
}

/*
 * Handle a stage-2 paging exit.  First try the fast path where the page
 * is present and only the page tables need updating; otherwise fault the
 * page in through the VM's vmspace map.  Returns EFAULT if the address
 * cannot be resolved.
 */
static int
vm_handle_paging(struct vcpu *vcpu, bool *retu)
{
        struct vm *vm = vcpu->vm;
        struct vm_exit *vme;
        struct vm_map *map;
        uint64_t addr, esr;
        pmap_t pmap;
        int ftype, rv;

        vme = &vcpu->exitinfo;
        pmap = vmspace_pmap(vm_vmspace(vm));
        addr = vme->u.paging.gpa;
        esr = vme->u.paging.esr;

        /* The page exists, but the page table needs to be updated. */
        if (pmap_fault(pmap, esr, addr) == KERN_SUCCESS)
                return (0);

        switch (ESR_ELx_EXCEPTION(esr)) {
        case EXCP_DATA_ABORT_L:
        case EXCP_INSN_ABORT_L:
                ftype = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
                break;
        default:
                panic("%s: Invalid exception (esr = %lx)", __func__, esr);
        }

        /* Slow path: take a full VM fault on the guest-physical address. */
        map = &vm_vmspace(vm)->vm_map;
        rv = vm_fault(map, addr, ftype, VM_FAULT_NORMAL, NULL);
        if (rv != KERN_SUCCESS)
                return (EFAULT);

        return (0);
}

/*
 * Handle a suspend exit: mark this vCPU suspended, wait until every
 * active vCPU has also suspended (periodically checking for thread
 * suspension requests), then wake the other sleepers and return to
 * userspace.
 */
static int
vm_handle_suspend(struct vcpu *vcpu, bool *retu)
{
        struct vm *vm = vcpu->vm;
        int error, i;
        struct thread *td;

        error = 0;
        td = curthread;

        CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);

        /*
         * Wait until all 'active_cpus' have suspended themselves.
         *
         * Since a VM may be suspended at any time including when one or
         * more vcpus are doing a rendezvous we need to call the rendezvous
         * handler while we are waiting to prevent a deadlock.
         */
        vcpu_lock(vcpu);
        while (error == 0) {
                if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0)
                        break;

                vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
                msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
                vcpu_require_state_locked(vcpu, VCPU_FROZEN);
                /* Honor any pending thread-suspension request. */
                if (td_ast_pending(td, TDA_SUSPEND)) {
                        vcpu_unlock(vcpu);
                        error = thread_check_susp(td, false);
                        vcpu_lock(vcpu);
                }
        }
        vcpu_unlock(vcpu);

        /*
         * Wakeup the other sleeping vcpus and return to userspace.
         */
        for (i = 0; i < vm->maxcpus; i++) {
                if (CPU_ISSET(i, &vm->suspended_cpus)) {
                        vcpu_notify_event(vm_vcpu(vm, i));
                }
        }

        *retu = true;
        return (error);
}

/*
 * Run a vCPU until an exit must be handled in userspace or an error
 * occurs.  Each iteration enters the guest inside a critical section
 * (with the guest FPU state loaded), then dispatches the resulting exit
 * code to the in-kernel handlers; handlers that fully consume the exit
 * leave retu false and the loop re-enters the guest.
 */
int
vm_run(struct vcpu *vcpu)
{
        struct vm *vm = vcpu->vm;
        struct vm_eventinfo evinfo;
        int error, vcpuid;
        struct vm_exit *vme;
        bool retu;
        pmap_t pmap;

        vcpuid = vcpu->vcpuid;

        /* Only active, non-suspended vCPUs may run. */
        if (!CPU_ISSET(vcpuid, &vm->active_cpus))
                return (EINVAL);

        if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
                return (EINVAL);

        pmap = vmspace_pmap(vm_vmspace(vm));
        vme = &vcpu->exitinfo;
        evinfo.rptr = NULL;
        evinfo.sptr = &vm->suspend;
        evinfo.iptr = NULL;
restart:
        critical_enter();

        restore_guest_fpustate(vcpu);

        vcpu_require_state(vcpu, VCPU_RUNNING);
        error = vmmops_run(vcpu->cookie, vcpu->nextpc, pmap, &evinfo);
        vcpu_require_state(vcpu, VCPU_FROZEN);

        save_guest_fpustate(vcpu);

        critical_exit();

        if (error == 0) {
                retu = false;
                switch (vme->exitcode) {
                case VM_EXITCODE_INST_EMUL:
                        /* Resume after the faulting instruction. */
                        vcpu->nextpc = vme->pc + vme->inst_length;
                        error = vm_handle_inst_emul(vcpu, &retu);
                        break;

                case VM_EXITCODE_REG_EMUL:
                        vcpu->nextpc = vme->pc + vme->inst_length;
                        error = vm_handle_reg_emul(vcpu, &retu);
                        break;

                case VM_EXITCODE_HVC:
                        /*
                         * The HVC instruction saves the address for the
                         * next instruction as the return address.
                         */
                        vcpu->nextpc = vme->pc;
                        /*
                         * The PSCI call can change the exit information in the
                         * case of suspend/reset/poweroff/cpu off/cpu on.
                         */
                        error = vm_handle_smccc_call(vcpu, vme, &retu);
                        break;

                case VM_EXITCODE_WFI:
                        vcpu->nextpc = vme->pc + vme->inst_length;
                        error = vm_handle_wfi(vcpu, vme, &retu);
                        break;

                case VM_EXITCODE_PAGING:
                        /* Retry the faulting instruction after the fix-up. */
                        vcpu->nextpc = vme->pc;
                        error = vm_handle_paging(vcpu, &retu);
                        break;

                case VM_EXITCODE_SUSPENDED:
                        vcpu->nextpc = vme->pc;
                        error = vm_handle_suspend(vcpu, &retu);
                        break;

                default:
                        /* Handle in userland */
                        vcpu->nextpc = vme->pc;
                        retu = true;
                        break;
                }
        }

        /* Exit fully handled in-kernel: go straight back into the guest. */
        if (error == 0 && retu == false)
                goto restart;

        return (error);
}