root/sys/amd64/vmm/intel/vmcs.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_bhyve_snapshot.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/pcpu.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/segments.h>
#include <machine/vmm.h>
#include <machine/vmm_snapshot.h>
#include "vmm_host.h"
#include "vmx_cpufunc.h"
#include "vmcs.h"
#include "ept.h"
#include "vmx.h"

#ifdef DDB
#include <ddb/ddb.h>
#endif

SYSCTL_DECL(_hw_vmm_vmx);

static int no_flush_rsb;
SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW,
    &no_flush_rsb, 0, "Do not flush RSB upon vmexit");

static uint64_t
vmcs_fix_regval(uint32_t encoding, uint64_t val)
{

        switch (encoding) {
        case VMCS_GUEST_CR0:
                val = vmx_fix_cr0(val);
                break;
        case VMCS_GUEST_CR4:
                val = vmx_fix_cr4(val);
                break;
        default:
                break;
        }
        return (val);
}

static uint32_t
vmcs_field_encoding(int ident)
{
        switch (ident) {
        case VM_REG_GUEST_CR0:
                return (VMCS_GUEST_CR0);
        case VM_REG_GUEST_CR3:
                return (VMCS_GUEST_CR3);
        case VM_REG_GUEST_CR4:
                return (VMCS_GUEST_CR4);
        case VM_REG_GUEST_DR7:
                return (VMCS_GUEST_DR7);
        case VM_REG_GUEST_RSP:
                return (VMCS_GUEST_RSP);
        case VM_REG_GUEST_RIP:
                return (VMCS_GUEST_RIP);
        case VM_REG_GUEST_RFLAGS:
                return (VMCS_GUEST_RFLAGS);
        case VM_REG_GUEST_ES:
                return (VMCS_GUEST_ES_SELECTOR);
        case VM_REG_GUEST_CS:
                return (VMCS_GUEST_CS_SELECTOR);
        case VM_REG_GUEST_SS:
                return (VMCS_GUEST_SS_SELECTOR);
        case VM_REG_GUEST_DS:
                return (VMCS_GUEST_DS_SELECTOR);
        case VM_REG_GUEST_FS:
                return (VMCS_GUEST_FS_SELECTOR);
        case VM_REG_GUEST_GS:
                return (VMCS_GUEST_GS_SELECTOR);
        case VM_REG_GUEST_TR:
                return (VMCS_GUEST_TR_SELECTOR);
        case VM_REG_GUEST_LDTR:
                return (VMCS_GUEST_LDTR_SELECTOR);
        case VM_REG_GUEST_EFER:
                return (VMCS_GUEST_IA32_EFER);
        case VM_REG_GUEST_PDPTE0:
                return (VMCS_GUEST_PDPTE0);
        case VM_REG_GUEST_PDPTE1:
                return (VMCS_GUEST_PDPTE1);
        case VM_REG_GUEST_PDPTE2:
                return (VMCS_GUEST_PDPTE2);
        case VM_REG_GUEST_PDPTE3:
                return (VMCS_GUEST_PDPTE3);
        case VM_REG_GUEST_ENTRY_INST_LENGTH:
                return (VMCS_ENTRY_INST_LENGTH);
        case VM_REG_GUEST_FS_BASE:
                return (VMCS_GUEST_FS_BASE);
        case VM_REG_GUEST_GS_BASE:
                return (VMCS_GUEST_GS_BASE);
        default:
                return (-1);
        }
}

static int
vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
{

        switch (seg) {
        case VM_REG_GUEST_ES:
                *base = VMCS_GUEST_ES_BASE;
                *lim = VMCS_GUEST_ES_LIMIT;
                *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_CS:
                *base = VMCS_GUEST_CS_BASE;
                *lim = VMCS_GUEST_CS_LIMIT;
                *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_SS:
                *base = VMCS_GUEST_SS_BASE;
                *lim = VMCS_GUEST_SS_LIMIT;
                *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_DS:
                *base = VMCS_GUEST_DS_BASE;
                *lim = VMCS_GUEST_DS_LIMIT;
                *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_FS:
                *base = VMCS_GUEST_FS_BASE;
                *lim = VMCS_GUEST_FS_LIMIT;
                *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_GS:
                *base = VMCS_GUEST_GS_BASE;
                *lim = VMCS_GUEST_GS_LIMIT;
                *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_TR:
                *base = VMCS_GUEST_TR_BASE;
                *lim = VMCS_GUEST_TR_LIMIT;
                *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_LDTR:
                *base = VMCS_GUEST_LDTR_BASE;
                *lim = VMCS_GUEST_LDTR_LIMIT;
                *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
                break;
        case VM_REG_GUEST_IDTR:
                *base = VMCS_GUEST_IDTR_BASE;
                *lim = VMCS_GUEST_IDTR_LIMIT;
                *acc = VMCS_INVALID_ENCODING;
                break;
        case VM_REG_GUEST_GDTR:
                *base = VMCS_GUEST_GDTR_BASE;
                *lim = VMCS_GUEST_GDTR_LIMIT;
                *acc = VMCS_INVALID_ENCODING;
                break;
        default:
                return (EINVAL);
        }

        return (0);
}

int
vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval)
{
        int error;
        uint32_t encoding;

        /*
         * If we need to get at vmx-specific state in the VMCS we can bypass
         * the translation of 'ident' to 'encoding' by simply setting the
         * sign bit. As it so happens the upper 16 bits are reserved (i.e
         * set to 0) in the encodings for the VMCS so we are free to use the
         * sign bit.
         */
        if (ident < 0)
                encoding = ident & 0x7fffffff;
        else
                encoding = vmcs_field_encoding(ident);

        if (encoding == (uint32_t)-1)
                return (EINVAL);

        if (!running)
                VMPTRLD(vmcs);

        error = vmread(encoding, retval);

        if (!running)
                VMCLEAR(vmcs);

        return (error);
}

int
vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val)
{
        int error;
        uint32_t encoding;

        if (ident < 0)
                encoding = ident & 0x7fffffff;
        else
                encoding = vmcs_field_encoding(ident);

        if (encoding == (uint32_t)-1)
                return (EINVAL);

        val = vmcs_fix_regval(encoding, val);

        if (!running)
                VMPTRLD(vmcs);

        error = vmwrite(encoding, val);

        if (!running)
                VMCLEAR(vmcs);

        return (error);
}

int
vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
{
        int error;
        uint32_t base, limit, access;

        error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
        if (error != 0)
                panic("vmcs_setdesc: invalid segment register %d", seg);

        if (!running)
                VMPTRLD(vmcs);
        if ((error = vmwrite(base, desc->base)) != 0)
                goto done;

        if ((error = vmwrite(limit, desc->limit)) != 0)
                goto done;

        if (access != VMCS_INVALID_ENCODING) {
                if ((error = vmwrite(access, desc->access)) != 0)
                        goto done;
        }
done:
        if (!running)
                VMCLEAR(vmcs);
        return (error);
}

int
vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
{
        int error;
        uint32_t base, limit, access;
        uint64_t u64;

        error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
        if (error != 0)
                panic("vmcs_getdesc: invalid segment register %d", seg);

        if (!running)
                VMPTRLD(vmcs);
        if ((error = vmread(base, &u64)) != 0)
                goto done;
        desc->base = u64;

        if ((error = vmread(limit, &u64)) != 0)
                goto done;
        desc->limit = u64;

        if (access != VMCS_INVALID_ENCODING) {
                if ((error = vmread(access, &u64)) != 0)
                        goto done;
                desc->access = u64;
        }
done:
        if (!running)
                VMCLEAR(vmcs);
        return (error);
}

int
vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
{
        int error;

        VMPTRLD(vmcs);

        /*
         * Guest MSRs are saved in the VM-exit MSR-store area.
         * Guest MSRs are loaded from the VM-entry MSR-load area.
         * Both areas point to the same location in memory.
         */
        if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
                goto done;
        if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
                goto done;
        if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
                goto done;

        error = 0;
done:
        VMCLEAR(vmcs);
        return (error);
}

int
vmcs_init(struct vmcs *vmcs)
{
        int error, codesel, datasel, tsssel;
        u_long cr0, cr4, efer;
        uint64_t pat, fsbase, idtrbase;

        codesel = vmm_get_host_codesel();
        datasel = vmm_get_host_datasel();
        tsssel = vmm_get_host_tsssel();

        /*
         * Make sure we have a "current" VMCS to work with.
         */
        VMPTRLD(vmcs);

        /* Host state */

        /* Initialize host IA32_PAT MSR */
        pat = vmm_get_host_pat();
        if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
                goto done;

        /* Load the IA32_EFER MSR */
        efer = vmm_get_host_efer();
        if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
                goto done;

        /* Load the control registers */

        cr0 = vmm_get_host_cr0();
        if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
                goto done;

        cr4 = vmm_get_host_cr4() | CR4_VMXE;
        if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
                goto done;

        /* Load the segment selectors */
        if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
                goto done;

        if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
                goto done;

        /*
         * Load the Base-Address for %fs and idtr.
         *
         * Note that we exclude %gs, tss and gdtr here because their base
         * address is pcpu specific.
         */
        fsbase = vmm_get_host_fsbase();
        if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
                goto done;

        idtrbase = vmm_get_host_idtrbase();
        if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
                goto done;

        /* instruction pointer */
        if (no_flush_rsb) {
                if ((error = vmwrite(VMCS_HOST_RIP,
                    (u_long)vmx_exit_guest)) != 0)
                        goto done;
        } else {
                if ((error = vmwrite(VMCS_HOST_RIP,
                    (u_long)vmx_exit_guest_flush_rsb)) != 0)
                        goto done;
        }

        /* link pointer */
        if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
                goto done;
done:
        VMCLEAR(vmcs);
        return (error);
}

#ifdef BHYVE_SNAPSHOT
int
vmcs_getany(struct vmcs *vmcs, int running, int ident, uint64_t *val)
{
        int error;

        if (!running)
                VMPTRLD(vmcs);

        error = vmread(ident, val);

        if (!running)
                VMCLEAR(vmcs);

        return (error);
}

int
vmcs_setany(struct vmcs *vmcs, int running, int ident, uint64_t val)
{
        int error;

        if (!running)
                VMPTRLD(vmcs);

        error = vmwrite(ident, val);

        if (!running)
                VMCLEAR(vmcs);

        return (error);
}

int
vmcs_snapshot_reg(struct vmcs *vmcs, int running, int ident,
                  struct vm_snapshot_meta *meta)
{
        int ret;
        uint64_t val;

        if (meta->op == VM_SNAPSHOT_SAVE) {
                ret = vmcs_getreg(vmcs, running, ident, &val);
                if (ret != 0)
                        goto done;

                SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
        } else if (meta->op == VM_SNAPSHOT_RESTORE) {
                SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);

                ret = vmcs_setreg(vmcs, running, ident, val);
                if (ret != 0)
                        goto done;
        } else {
                ret = EINVAL;
                goto done;
        }

done:
        return (ret);
}

int
vmcs_snapshot_desc(struct vmcs *vmcs, int running, int seg,
                   struct vm_snapshot_meta *meta)
{
        int ret;
        struct seg_desc desc;

        if (meta->op == VM_SNAPSHOT_SAVE) {
                ret = vmcs_getdesc(vmcs, running, seg, &desc);
                if (ret != 0)
                        goto done;

                SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
                SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
                SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);
        } else if (meta->op == VM_SNAPSHOT_RESTORE) {
                SNAPSHOT_VAR_OR_LEAVE(desc.base, meta, ret, done);
                SNAPSHOT_VAR_OR_LEAVE(desc.limit, meta, ret, done);
                SNAPSHOT_VAR_OR_LEAVE(desc.access, meta, ret, done);

                ret = vmcs_setdesc(vmcs, running, seg, &desc);
                if (ret != 0)
                        goto done;
        } else {
                ret = EINVAL;
                goto done;
        }

done:
        return (ret);
}

int
vmcs_snapshot_any(struct vmcs *vmcs, int running, int ident,
                  struct vm_snapshot_meta *meta)
{
        int ret;
        uint64_t val;

        if (meta->op == VM_SNAPSHOT_SAVE) {
                ret = vmcs_getany(vmcs, running, ident, &val);
                if (ret != 0)
                        goto done;

                SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
        } else if (meta->op == VM_SNAPSHOT_RESTORE) {
                SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);

                ret = vmcs_setany(vmcs, running, ident, val);
                if (ret != 0)
                        goto done;
        } else {
                ret = EINVAL;
                goto done;
        }

done:
        return (ret);
}
#endif

#ifdef DDB
extern int vmxon_enabled[];

DB_SHOW_COMMAND(vmcs, db_show_vmcs)
{
        uint64_t cur_vmcs, val;
        uint32_t exit;

        if (!vmxon_enabled[curcpu]) {
                db_printf("VMX not enabled\n");
                return;
        }

        if (have_addr) {
                db_printf("Only current VMCS supported\n");
                return;
        }

        vmptrst(&cur_vmcs);
        if (cur_vmcs == VMCS_INITIAL) {
                db_printf("No current VM context\n");
                return;
        }
        db_printf("VMCS: %jx\n", cur_vmcs);
        db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
        db_printf("Activity: ");
        val = vmcs_read(VMCS_GUEST_ACTIVITY);
        switch (val) {
        case 0:
                db_printf("Active");
                break;
        case 1:
                db_printf("HLT");
                break;
        case 2:
                db_printf("Shutdown");
                break;
        case 3:
                db_printf("Wait for SIPI");
                break;
        default:
                db_printf("Unknown: %#lx", val);
        }
        db_printf("\n");
        exit = vmcs_read(VMCS_EXIT_REASON);
        if (exit & 0x80000000)
                db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
        else
                db_printf("Exit Reason: %u\n", exit & 0xffff);
        db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
        db_printf("Guest Linear Address: %#lx\n",
            vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
        switch (exit & 0x8000ffff) {
        case EXIT_REASON_EXCEPTION:
        case EXIT_REASON_EXT_INTR:
                val = vmcs_read(VMCS_EXIT_INTR_INFO);
                db_printf("Interrupt Type: ");
                switch (val >> 8 & 0x7) {
                case 0:
                        db_printf("external");
                        break;
                case 2:
                        db_printf("NMI");
                        break;
                case 3:
                        db_printf("HW exception");
                        break;
                case 4:
                        db_printf("SW exception");
                        break;
                default:
                        db_printf("?? %lu", val >> 8 & 0x7);
                        break;
                }
                db_printf("  Vector: %lu", val & 0xff);
                if (val & 0x800)
                        db_printf("  Error Code: %lx",
                            vmcs_read(VMCS_EXIT_INTR_ERRCODE));
                db_printf("\n");
                break;
        case EXIT_REASON_EPT_FAULT:
        case EXIT_REASON_EPT_MISCONFIG:
                db_printf("Guest Physical Address: %#lx\n",
                    vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
                break;
        }
        db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
}
#endif