root/usr/src/cmd/bhyve/amd64/task_switch.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2020 Oxide Computer Company
 */


#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include <vmmapi.h>

#include "bhyverun.h"
#include "debug.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
        uint16_t        tss_link;
        uint16_t        rsvd1;
        uint32_t        tss_esp0;
        uint16_t        tss_ss0;
        uint16_t        rsvd2;
        uint32_t        tss_esp1;
        uint16_t        tss_ss1;
        uint16_t        rsvd3;
        uint32_t        tss_esp2;
        uint16_t        tss_ss2;
        uint16_t        rsvd4;
        uint32_t        tss_cr3;
        uint32_t        tss_eip;
        uint32_t        tss_eflags;
        uint32_t        tss_eax;
        uint32_t        tss_ecx;
        uint32_t        tss_edx;
        uint32_t        tss_ebx;
        uint32_t        tss_esp;
        uint32_t        tss_ebp;
        uint32_t        tss_esi;
        uint32_t        tss_edi;
        uint16_t        tss_es;
        uint16_t        rsvd5;
        uint16_t        tss_cs;
        uint16_t        rsvd6;
        uint16_t        tss_ss;
        uint16_t        rsvd7;
        uint16_t        tss_ds;
        uint16_t        rsvd8;
        uint16_t        tss_fs;
        uint16_t        rsvd9;
        uint16_t        tss_gs;
        uint16_t        rsvd10;
        uint16_t        tss_ldt;
        uint16_t        rsvd11;
        uint16_t        tss_trap;
        uint16_t        tss_iomap;
};
static_assert(sizeof(struct tss32) == 104, "struct tss32 must be 104 bytes");

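/*
 * The low 3 bits of a selector hold the RPL and table indicator, so
 * clearing them yields the descriptor's byte offset within the GDT/LDT
 * (SEL_START) and setting them yields the offset of its last byte
 * (SEL_LIMIT). Bit 1 of a system-segment type field is the TSS busy bit.
 */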
#define SEL_START(sel)  (((sel) & ~0x7))
#define SEL_LIMIT(sel)  (((sel) | 0x7))
#define TSS_BUSY(type)  (((type) & 0x2) != 0)

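/*
 * Thin wrappers around vm_get_register()/vm_set_register(); a failure
 * here indicates a programming error, so it is treated as fatal.
 */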
static uint64_t
GETREG(struct vcpu *vcpu, int reg)
{
        uint64_t val;
        int error;

        error = vm_get_register(vcpu, reg, &val);
        assert(error == 0);
        return (val);
}

static void
SETREG(struct vcpu *vcpu, int reg, uint64_t val)
{
        int error;

        error = vm_set_register(vcpu, reg, val);
        assert(error == 0);
}

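/*
 * Convert a descriptor-table entry into the 'seg_desc' form used by the
 * vmm register interface. A limit with the granularity bit set is in
 * 4KB units, so it is scaled up with the low 12 bits filled in.
 */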
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
        struct seg_desc seg_desc;

        seg_desc.base = (u_int)USD_GETBASE(usd);
        if (usd->sd_gran)
                seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
        else
                seg_desc.limit = (u_int)USD_GETLIMIT(usd);
        seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
        seg_desc.access |= usd->sd_xx << 12;
        seg_desc.access |= usd->sd_def32 << 14;
        seg_desc.access |= usd->sd_gran << 15;

        return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vcpu *vcpu, int vector, uint16_t sel, int ext)
{
        /*
         * Bit 2 from the selector is retained as-is in the error code.
         *
         * Bit 1 can be safely cleared because none of the selectors
         * encountered during task switch emulation refer to a task
         * gate in the IDT.
         *
         * Bit 0 is set depending on the value of 'ext'.
         */
        sel &= ~0x3;
        if (ext)
                sel |= 0x1;
        vm_inject_fault(vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vcpu *vcpu, uint16_t sel)
{
        uint64_t base;
        uint32_t limit, access;
        int error, reg;

        reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
        error = vm_get_desc(vcpu, reg, &base, &limit, &access);
        assert(error == 0);

        if (reg == VM_REG_GUEST_LDTR) {
                if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
                        return (-1);
        }

        if (limit < SEL_LIMIT(sel))
                return (-1);
        else
                return (0);
}

/*
 * Read or write the segment descriptor 'desc' from/to the GDT/LDT slot
 * by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread,
    int *faultptr)
{
        struct iovec iov[2];
        uint64_t base;
        uint32_t limit, access;
        int error, reg;

        reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
        error = vm_get_desc(vcpu, reg, &base, &limit, &access);
        assert(error == 0);
        assert(limit >= SEL_LIMIT(sel));

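        /* The 8-byte descriptor may straddle a page boundary: two iovecs */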
        error = vm_copy_setup(vcpu, paging, base + SEL_START(sel),
            sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov),
            faultptr);
        if (error || *faultptr)
                return (error);

        if (doread)
                vm_copyin(iov, desc, sizeof(*desc));
        else
                vm_copyout(desc, iov, sizeof(*desc));
        return (0);
}

static int
desc_table_read(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
        return (desc_table_rw(vcpu, paging, sel, desc, true, faultptr));
}

static int
desc_table_write(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
        return (desc_table_rw(vcpu, paging, sel, desc, false, faultptr));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc, int *faultptr)
{
        struct vm_guest_paging sup_paging;
        int error;

        assert(!ISLDT(sel));
        assert(IDXSEL(sel) != 0);

        /* Fetch the new TSS descriptor */
        if (desc_table_limit_check(vcpu, sel)) {
                if (ts->reason == TSR_IRET)
                        sel_exception(vcpu, IDT_TS, sel, ts->ext);
                else
                        sel_exception(vcpu, IDT_GP, sel, ts->ext);
                return (1);
        }

        sup_paging = ts->paging;
        sup_paging.cpl = 0;             /* implicit supervisor mode */
        error = desc_table_read(vcpu, &sup_paging, sel, desc, faultptr);
        return (error);
}

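/*
 * Classify the 5-bit descriptor type (the S bit plus the 4-bit type):
 * 0x10 set means a code/data descriptor, 0x8 distinguishes code from
 * data, and 0x2 is the readable (code) or writable (data) bit.
 */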
static bool
code_desc(int sd_type)
{
        /* code descriptor */
        return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
        /* writable data descriptor */
        return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
        /* data descriptor or a readable code descriptor */
        return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

        return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 */
static int
validate_seg_desc(struct vcpu *vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc, int *faultptr)
{
        struct vm_guest_paging sup_paging;
        struct user_segment_descriptor usd;
        int error, idtvec;
        int cpl, dpl, rpl;
        uint16_t sel, cs;
        bool ldtseg, codeseg, stackseg, dataseg, conforming;

        ldtseg = codeseg = stackseg = dataseg = false;
        switch (segment) {
        case VM_REG_GUEST_LDTR:
                ldtseg = true;
                break;
        case VM_REG_GUEST_CS:
                codeseg = true;
                break;
        case VM_REG_GUEST_SS:
                stackseg = true;
                break;
        case VM_REG_GUEST_DS:
        case VM_REG_GUEST_ES:
        case VM_REG_GUEST_FS:
        case VM_REG_GUEST_GS:
                dataseg = true;
                break;
        default:
                assert(0);
        }

        /* Get the segment selector */
        sel = GETREG(vcpu, segment);

        /* LDT selector must point into the GDT */
        if (ldtseg && ISLDT(sel)) {
                sel_exception(vcpu, IDT_TS, sel, ts->ext);
                return (1);
        }

        /* Descriptor table limit check */
        if (desc_table_limit_check(vcpu, sel)) {
                sel_exception(vcpu, IDT_TS, sel, ts->ext);
                return (1);
        }

        /* NULL selector */
        if (IDXSEL(sel) == 0) {
                /* Code and stack segment selectors cannot be NULL */
                if (codeseg || stackseg) {
                        sel_exception(vcpu, IDT_TS, sel, ts->ext);
                        return (1);
                }
                seg_desc->base = 0;
                seg_desc->limit = 0;
                seg_desc->access = 0x10000;     /* unusable */
                return (0);
        }

        /* Read the descriptor from the GDT/LDT */
        sup_paging = ts->paging;
        sup_paging.cpl = 0;     /* implicit supervisor mode */
        error = desc_table_read(vcpu, &sup_paging, sel, &usd, faultptr);
        if (error || *faultptr)
                return (error);

        /* Verify that the descriptor type is compatible with the segment */
        if ((ldtseg && !ldt_desc(usd.sd_type)) ||
            (codeseg && !code_desc(usd.sd_type)) ||
            (dataseg && !data_desc(usd.sd_type)) ||
            (stackseg && !stack_desc(usd.sd_type))) {
                sel_exception(vcpu, IDT_TS, sel, ts->ext);
                return (1);
        }

        /* Segment must be marked present */
        if (!usd.sd_p) {
                if (ldtseg)
                        idtvec = IDT_TS;
                else if (stackseg)
                        idtvec = IDT_SS;
                else
                        idtvec = IDT_NP;
                sel_exception(vcpu, idtvec, sel, ts->ext);
                return (1);
        }

        cs = GETREG(vcpu, VM_REG_GUEST_CS);
        cpl = cs & SEL_RPL_MASK;
        rpl = sel & SEL_RPL_MASK;
        dpl = usd.sd_dpl;

        if (stackseg && (rpl != cpl || dpl != cpl)) {
                sel_exception(vcpu, IDT_TS, sel, ts->ext);
                return (1);
        }

        if (codeseg) {
                conforming = (usd.sd_type & 0x4) ? true : false;
                if ((conforming && (cpl < dpl)) ||
                    (!conforming && (cpl != dpl))) {
                        sel_exception(vcpu, IDT_TS, sel, ts->ext);
                        return (1);
                }
        }

        if (dataseg) {
                /*
         * A data segment is always non-conforming except when its
                 * descriptor is a readable, conforming code segment.
                 */
                if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
                        conforming = true;
                else
                        conforming = false;

                if (!conforming && (rpl > dpl || cpl > dpl)) {
                        sel_exception(vcpu, IDT_TS, sel, ts->ext);
                        return (1);
                }
        }
        *seg_desc = usd_to_seg_desc(&usd);
        return (0);
}

static void
tss32_save(struct vcpu *vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

        /* General purpose registers */
        tss->tss_eax = GETREG(vcpu, VM_REG_GUEST_RAX);
        tss->tss_ecx = GETREG(vcpu, VM_REG_GUEST_RCX);
        tss->tss_edx = GETREG(vcpu, VM_REG_GUEST_RDX);
        tss->tss_ebx = GETREG(vcpu, VM_REG_GUEST_RBX);
        tss->tss_esp = GETREG(vcpu, VM_REG_GUEST_RSP);
        tss->tss_ebp = GETREG(vcpu, VM_REG_GUEST_RBP);
        tss->tss_esi = GETREG(vcpu, VM_REG_GUEST_RSI);
        tss->tss_edi = GETREG(vcpu, VM_REG_GUEST_RDI);

        /* Segment selectors */
        tss->tss_es = GETREG(vcpu, VM_REG_GUEST_ES);
        tss->tss_cs = GETREG(vcpu, VM_REG_GUEST_CS);
        tss->tss_ss = GETREG(vcpu, VM_REG_GUEST_SS);
        tss->tss_ds = GETREG(vcpu, VM_REG_GUEST_DS);
        tss->tss_fs = GETREG(vcpu, VM_REG_GUEST_FS);
        tss->tss_gs = GETREG(vcpu, VM_REG_GUEST_GS);

        /* eflags and eip */
        tss->tss_eflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
        if (task_switch->reason == TSR_IRET)
                tss->tss_eflags &= ~PSL_NT;
        tss->tss_eip = eip;

        /* Copy updated old TSS into guest memory */
        vm_copyout(tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *sd)
{
        int error;

        error = vm_set_desc(vcpu, reg, sd->base, sd->limit, sd->access);
        assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 */
static int
tss32_restore(struct vmctx *ctx, struct vcpu *vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
{
        struct seg_desc seg_desc, seg_desc2;
        uint64_t *pdpte, maxphyaddr, reserved;
        uint32_t eflags;
        int error, i;
        bool nested;

        nested = false;
        if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
                tss->tss_link = ot_sel;
                nested = true;
        }

        eflags = tss->tss_eflags;
        if (nested)
                eflags |= PSL_NT;

        /* LDTR */
        SETREG(vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
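        /* The hidden LDTR descriptor state is validated and loaded below */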

        /* PDBR (CR3) */
        if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
                if (ts->paging.paging_mode == PAGING_MODE_PAE) {
                        /*
                         * XXX Assuming 36-bit MAXPHYADDR.
                         */
                        maxphyaddr = (1UL << 36) - 1;
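                        /*
                         * The four 8-byte PDPTEs reside at the 32-byte
                         * aligned physical address held in CR3.
                         */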
                        pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
                        for (i = 0; i < 4; i++) {
                                /* Check reserved bits if the PDPTE is valid */
                                if (!(pdpte[i] & 0x1))
                                        continue;
                                /*
                                 * Bits 2:1, 8:5 and bits above the processor's
                                 * maximum physical address are reserved.
                                 */
                                reserved = ~maxphyaddr | 0x1E6;
                                if (pdpte[i] & reserved) {
                                        vm_inject_gp(vcpu);
                                        return (1);
                                }
                        }
                        SETREG(vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
                        SETREG(vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
                        SETREG(vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
                        SETREG(vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
                }
                SETREG(vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
                ts->paging.cr3 = tss->tss_cr3;
        }

        /* eflags and eip */
        SETREG(vcpu, VM_REG_GUEST_RFLAGS, eflags);
        SETREG(vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

        /* General purpose registers */
        SETREG(vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
        SETREG(vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
        SETREG(vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
        SETREG(vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
        SETREG(vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
        SETREG(vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
        SETREG(vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
        SETREG(vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

        /* Segment selectors */
        SETREG(vcpu, VM_REG_GUEST_ES, tss->tss_es);
        SETREG(vcpu, VM_REG_GUEST_CS, tss->tss_cs);
        SETREG(vcpu, VM_REG_GUEST_SS, tss->tss_ss);
        SETREG(vcpu, VM_REG_GUEST_DS, tss->tss_ds);
        SETREG(vcpu, VM_REG_GUEST_FS, tss->tss_fs);
        SETREG(vcpu, VM_REG_GUEST_GS, tss->tss_gs);

        /*
         * If this is a nested task then write out the new TSS to update
         * the previous link field.
         */
        if (nested)
                vm_copyout(tss, iov, sizeof(*tss));

        /* Validate segment descriptors */
        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_LDTR, &seg_desc);

        /*
         * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
         *
         * The SS and CS attribute checks on VM-entry are inter-dependent so
         * we need to make sure that both segments are valid before updating
         * either of them. This ensures that the VMCS state can pass the
         * VM-entry checks so the guest can handle any exception injected
         * during task switch emulation.
         */
        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_CS, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);

        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_SS, &seg_desc2,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_CS, &seg_desc);
        update_seg_desc(vcpu, VM_REG_GUEST_SS, &seg_desc2);
        ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_DS, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_DS, &seg_desc);

        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_ES, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_ES, &seg_desc);

        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_FS, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_FS, &seg_desc);

        error = validate_seg_desc(vcpu, ts, VM_REG_GUEST_GS, &seg_desc,
            faultptr);
        if (error || *faultptr)
                return (error);
        update_seg_desc(vcpu, VM_REG_GUEST_GS, &seg_desc);

        return (0);
}


/*
 * Copy of vie_alignment_check() from vmm_instruction_emul.c
 */
static int
alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
{
        assert(size == 1 || size == 2 || size == 4 || size == 8);
        assert(cpl >= 0 && cpl <= 3);

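        /* #AC is possible only at CPL 3 with CR0.AM and EFLAGS.AC both set */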
        if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
                return (0);

        return ((gla & (size - 1)) ? 1 : 0);
}

/*
 * Copy of vie_size2mask() from vmm_instruction_emul.c
 */
static uint64_t
size2mask(int size)
{
        switch (size) {
        case 1:
                return (0xff);
        case 2:
                return (0xffff);
        case 4:
                return (0xffffffff);
        case 8:
                return (0xffffffffffffffff);
        default:
                assert(0);
                /* not reached */
                return (0);
        }
}

/*
 * Copy of vie_calculate_gla() from vmm_instruction_emul.c
 */
static int
calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
        uint64_t firstoff, low_limit, high_limit, segbase;
        int glasize, type;

        assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS);
        assert((length == 1 || length == 2 || length == 4 || length == 8));
        assert((prot & ~(PROT_READ | PROT_WRITE)) == 0);

        firstoff = offset;
        if (cpu_mode == CPU_MODE_64BIT) {
                assert(addrsize == 4 || addrsize == 8);
                glasize = 8;
        } else {
                assert(addrsize == 2 || addrsize == 4);
                glasize = 4;
                /*
                 * If the segment selector is loaded with a NULL selector
                 * then the descriptor is unusable and attempting to use
                 * it results in a #GP(0).
                 */
                if (SEG_DESC_UNUSABLE(desc->access))
                        return (-1);

                /*
                 * The processor generates a #NP exception when a segment
                 * register is loaded with a selector that points to a
                 * descriptor that is not present. If this was the case then
                 * it would have been checked before the VM-exit.
                 */
                assert(SEG_DESC_PRESENT(desc->access));

                /*
                 * The descriptor type must indicate a code/data segment.
                 */
                type = SEG_DESC_TYPE(desc->access);
                assert(type >= 16 && type <= 31);

                if (prot & PROT_READ) {
                        /* #GP on a read access to an exec-only code segment */
                        if ((type & 0xA) == 0x8)
                                return (-1);
                }

                if (prot & PROT_WRITE) {
                        /*
                         * #GP on a write access to a code segment or a
                         * read-only data segment.
                         */
                        if (type & 0x8)                 /* code segment */
                                return (-1);

                        if ((type & 0xA) == 0)          /* read-only data seg */
                                return (-1);
                }

                /*
                 * 'desc->limit' is fully expanded taking granularity into
                 * account.
                 */
                if ((type & 0xC) == 0x4) {
                        /* expand-down data segment */
                        low_limit = desc->limit + 1;
                        high_limit = SEG_DESC_DEF32(desc->access) ?
                            0xffffffff : 0xffff;
                } else {
                        /* code segment or expand-up data segment */
                        low_limit = 0;
                        high_limit = desc->limit;
                }

                while (length > 0) {
                        offset &= size2mask(addrsize);
                        if (offset < low_limit || offset > high_limit)
                                return (-1);
                        offset++;
                        length--;
                }
        }

        /*
         * In 64-bit mode all segments except %fs and %gs have a segment
         * base address of 0.
         */
        if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
            seg != VM_REG_GUEST_GS) {
                segbase = 0;
        } else {
                segbase = desc->base;
        }

        /*
         * Truncate 'firstoff' to the effective address size before adding
         * it to the segment base.
         */
        firstoff &= size2mask(addrsize);
        *gla = (segbase + firstoff) & size2mask(glasize);
        return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 */
static int
push_errcode(struct vcpu *vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode, int *faultptr)
{
        struct iovec iov[2];
        struct seg_desc seg_desc;
        int stacksize, bytes, error;
        uint64_t gla, cr0, rflags;
        uint32_t esp;
        uint16_t stacksel;

        *faultptr = 0;

        cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
        rflags = GETREG(vcpu, VM_REG_GUEST_RFLAGS);
        stacksel = GETREG(vcpu, VM_REG_GUEST_SS);

        error = vm_get_desc(vcpu, VM_REG_GUEST_SS, &seg_desc.base,
            &seg_desc.limit, &seg_desc.access);
        assert(error == 0);

        /*
         * Section "Error Code" in the Intel SDM vol 3: the error code is
         * pushed on the stack as a doubleword or word (depending on the
         * default interrupt, trap or task gate size).
         */
        if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
                bytes = 4;
        else
                bytes = 2;

        /*
         * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
         * stack-segment descriptor determines the size of the stack
         * pointer outside of 64-bit mode.
         */
        if (SEG_DESC_DEF32(seg_desc.access))
                stacksize = 4;
        else
                stacksize = 2;

        esp = GETREG(vcpu, VM_REG_GUEST_RSP);
        esp -= bytes;

        if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
            &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
                sel_exception(vcpu, IDT_SS, stacksel, 1);
                *faultptr = 1;
                return (0);
        }

        if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
                vm_inject_ac(vcpu, 1);
                *faultptr = 1;
                return (0);
        }

        error = vm_copy_setup(vcpu, paging, gla, bytes, PROT_WRITE,
            iov, nitems(iov), faultptr);
        if (error || *faultptr)
                return (error);

        vm_copyout(&errcode, iov, bytes);
        SETREG(vcpu, VM_REG_GUEST_RSP, esp);
        return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 */
#define CHKERR(error, fault)                                            \
        do {                                                            \
                assert((error == 0) || (error == EFAULT));              \
                if (error)                                              \
                        return (VMEXIT_ABORT);                          \
                else if (fault)                                         \
                        return (VMEXIT_CONTINUE);                       \
        } while (0)

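/*
 * Emulate a task switch that caused a VM exit: validate the new TSS and
 * its descriptor, save the outgoing task's state into the old TSS,
 * update the busy bits and the task register, and finally load the
 * incoming task's state.
 */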
int
vmexit_task_switch(struct vmctx *ctx, struct vcpu *vcpu, struct vm_exit *vmexit)
{
        struct seg_desc nt;
        struct tss32 oldtss, newtss;
        struct vm_task_switch *task_switch;
        struct vm_guest_paging *paging, sup_paging;
        struct user_segment_descriptor nt_desc, ot_desc;
        struct iovec nt_iov[2], ot_iov[2];
        uint64_t cr0, ot_base;
        uint32_t eip, ot_lim, access;
        int error, ext, fault, minlimit, nt_type, ot_type;
        enum task_switch_reason reason;
        uint16_t nt_sel, ot_sel;

        task_switch = &vmexit->u.task_switch;
        nt_sel = task_switch->tsssel;
        ext = vmexit->u.task_switch.ext;
        reason = vmexit->u.task_switch.reason;
        paging = &vmexit->u.task_switch.paging;

        assert(paging->cpu_mode == CPU_MODE_PROTECTED);

        /*
         * Calculate the instruction pointer to store in the old TSS.
         */
        eip = vmexit->rip + vmexit->inst_length;

        /*
         * Section 4.6, "Access Rights" in Intel SDM Vol 3.
         * The following page table accesses are implicitly supervisor mode:
         * - accesses to GDT or LDT to load segment descriptors
         * - accesses to the task state segment during task switch
         */
        sup_paging = *paging;
        sup_paging.cpl = 0;     /* implicit supervisor mode */

        /* Fetch the new TSS descriptor */
        error = read_tss_descriptor(vcpu, task_switch, nt_sel, &nt_desc,
            &fault);
        CHKERR(error, fault);

        nt = usd_to_seg_desc(&nt_desc);

        /* Verify the type of the new TSS */
        nt_type = SEG_DESC_TYPE(nt.access);
        if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
            nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
                sel_exception(vcpu, IDT_TS, nt_sel, ext);
                goto done;
        }

        /* TSS descriptor must have present bit set */
        if (!SEG_DESC_PRESENT(nt.access)) {
                sel_exception(vcpu, IDT_NP, nt_sel, ext);
                goto done;
        }

        /*
         * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
         * 44 bytes for a 16-bit TSS.
         */
        if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
                minlimit = 104 - 1;
        else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
                minlimit = 44 - 1;
        else
                minlimit = 0;

        assert(minlimit > 0);
        if (nt.limit < (unsigned int)minlimit) {
                sel_exception(vcpu, IDT_TS, nt_sel, ext);
                goto done;
        }

        /* TSS must be busy if task switch is due to IRET */
        if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
                sel_exception(vcpu, IDT_TS, nt_sel, ext);
                goto done;
        }

        /*
         * TSS must be available (not busy) if task switch reason is
         * CALL, JMP, exception or interrupt.
         */
        if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
                sel_exception(vcpu, IDT_GP, nt_sel, ext);
                goto done;
        }

        /* Fetch the new TSS */
        error = vm_copy_setup(vcpu, &sup_paging, nt.base, minlimit + 1,
            PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault);
        CHKERR(error, fault);
        vm_copyin(nt_iov, &newtss, minlimit + 1);

        /* Get the old TSS selector from the guest's task register */
        ot_sel = GETREG(vcpu, VM_REG_GUEST_TR);
        if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
                /*
                 * This might happen if a task switch was attempted without
                 * ever loading the task register with LTR. In this case the
                 * TR would contain the values from power-on:
                 * (sel = 0, base = 0, limit = 0xffff).
                 */
                sel_exception(vcpu, IDT_TS, ot_sel, task_switch->ext);
                goto done;
        }

        /* Get the old TSS base and limit from the guest's task register */
        error = vm_get_desc(vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
            &access);
        assert(error == 0);
        assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
        ot_type = SEG_DESC_TYPE(access);
        assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

        /* Fetch the old TSS descriptor */
        error = read_tss_descriptor(vcpu, task_switch, ot_sel, &ot_desc,
            &fault);
        CHKERR(error, fault);

        /* Get the old TSS */
        error = vm_copy_setup(vcpu, &sup_paging, ot_base, minlimit + 1,
            PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault);
        CHKERR(error, fault);
        vm_copyin(ot_iov, &oldtss, minlimit + 1);

        /*
         * Clear the busy bit in the old TSS descriptor if the task switch
         * is due to an IRET or JMP instruction.
         */
        if (reason == TSR_IRET || reason == TSR_JMP) {
                ot_desc.sd_type &= ~0x2;
                error = desc_table_write(vcpu, &sup_paging, ot_sel,
                    &ot_desc, &fault);
                CHKERR(error, fault);
        }

        if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
                EPRINTLN("Task switch to 16-bit TSS not supported");
                return (VMEXIT_ABORT);
        }

        /* Save processor state in old TSS */
        tss32_save(vcpu, task_switch, eip, &oldtss, ot_iov);

        /*
         * If the task switch was triggered for any reason other than IRET
         * then set the busy bit in the new TSS descriptor.
         */
        if (reason != TSR_IRET) {
                nt_desc.sd_type |= 0x2;
                error = desc_table_write(vcpu, &sup_paging, nt_sel,
                    &nt_desc, &fault);
                CHKERR(error, fault);
        }

        /* Update task register to point at the new TSS */
        SETREG(vcpu, VM_REG_GUEST_TR, nt_sel);

        /* Update the hidden descriptor state of the task register */
        nt = usd_to_seg_desc(&nt_desc);
        update_seg_desc(vcpu, VM_REG_GUEST_TR, &nt);

        /*
         * Set CR0.TS: hardware does this on every task switch so that the
         * new task's first FPU access raises #NM, enabling lazy FPU
         * context switching.
         */
        cr0 = GETREG(vcpu, VM_REG_GUEST_CR0);
        SETREG(vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

        /*
         * We are now committed to the task switch. Any exceptions encountered
         * after this point will be handled in the context of the new task and
         * the saved instruction pointer will belong to the new task.
         */
        error = vm_set_register(vcpu, VM_REG_GUEST_RIP, newtss.tss_eip);
        assert(error == 0);

        /* Load processor state from new TSS */
        error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov,
            &fault);
        CHKERR(error, fault);

        /*
         * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
         * caused an error code to be generated, this error code is copied
         * to the stack of the new task.
         */
        if (task_switch->errcode_valid) {
                assert(task_switch->ext);
                assert(task_switch->reason == TSR_IDT_GATE);
                error = push_errcode(vcpu, &task_switch->paging, nt_type,
                    task_switch->errcode, &fault);
                CHKERR(error, fault);
        }

        /*
         * Treatment of virtual-NMI blocking if NMI is delivered through
         * a task gate.
         *
         * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
         * If the virtual NMIs VM-execution control is 1, VM entry injects
         * an NMI, and delivery of the NMI causes a task switch that causes
         * a VM exit, virtual-NMI blocking is in effect before the VM exit
         * commences.
         *
         * Thus, virtual-NMI blocking is in effect at the time of the task
         * switch VM exit.
         */

        /*
         * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
         *
         * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
         * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
         * This unblocking of virtual-NMI occurs even if IRET causes a fault.
         *
         * Thus, virtual-NMI blocking is cleared at the time of the task switch
         * VM exit.
         */

        /*
         * If the task switch was triggered by an event delivered through
         * the IDT then extinguish the pending event from the vcpu's
         * exitintinfo.
         */
        if (task_switch->reason == TSR_IDT_GATE) {
                error = vm_set_intinfo(vcpu, 0);
                assert(error == 0);
        }

        /*
         * XXX should inject debug exception if 'T' bit is 1
         */
done:
        return (VMEXIT_CONTINUE);
}