root/usr/src/uts/intel/os/desctbls.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2018 Joyent, Inc. All rights reserved.
 * Copyright 2022 Oxide Computer Company
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/tss.h>
#include <sys/segments.h>
#include <sys/trap.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/kobj.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/note.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <vm/as.h>
#endif

#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>

/*
 * cpu0 and default tables and structures.
 */
user_desc_t     *gdt0;
#if !defined(__xpv)
desctbr_t       gdt0_default_r;
#endif

gate_desc_t     *idt0;          /* interrupt descriptor table */

tss_t           *ktss0;                 /* kernel task state structure */


user_desc_t     zero_udesc;             /* base zero user desc native procs */
user_desc_t     null_udesc;             /* null user descriptor */
system_desc_t   null_sdesc;             /* null system descriptor */

user_desc_t     zero_u32desc;           /* 32-bit compatibility procs */

user_desc_t     ucs_on;
user_desc_t     ucs_off;
user_desc_t     ucs32_on;
user_desc_t     ucs32_off;

/*
 * If the size of this is changed, you must update hat_pcp_setup() and the
 * definitions in exception.s
 */
extern char dblfault_stack0[DEFAULTSTKSZ];
extern char nmi_stack0[DEFAULTSTKSZ];
extern char mce_stack0[DEFAULTSTKSZ];

extern void     fast_null(void);
extern hrtime_t get_hrtime(void);
extern hrtime_t gethrvtime(void);
extern hrtime_t get_hrestime(void);
extern uint64_t getlgrp(void);

/*
 * Fast-trap dispatch table, indexed by the T_F* function codes used with
 * the T_FASTTRAP gate (see "install fast trap handler" below).  The first
 * three slots start out as fast_null and may be replaced later.
 */
void (*(fasttable[]))(void) = {
        fast_null,                      /* T_FNULL routine */
        fast_null,                      /* T_FGETFP routine (initially null) */
        fast_null,                      /* T_FSETFP routine (initially null) */
        (void (*)())(uintptr_t)get_hrtime,      /* T_GETHRTIME */
        (void (*)())(uintptr_t)gethrvtime,      /* T_GETHRVTIME */
        (void (*)())(uintptr_t)get_hrestime,    /* T_GETHRESTIME */
        (void (*)())(uintptr_t)getlgrp          /* T_GETLGRP */
};

/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
        int ih_inum;
        gate_desc_t ih_interp_desc;
        gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];

/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, uint32_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
        uintptr_t ba = (uintptr_t)base;

        ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
        ASSERT3P(dp, !=, NULL);
        /* A user descriptor must never describe a "system" segment. */
        ASSERT3U(type & SDT_S, !=, 0);
        /*
         * We should always set the "accessed" bit (SDT_A), otherwise the CPU
         * will write to the GDT whenever we change segment registers around.
         * With KPTI on, the GDT is read-only in the user page table, which
         * causes crashes if we don't set this.
         */
        ASSERT3U(type & SDT_A, !=, 0);

        /*
         * In 64-bit long mode the default operand size bit must be clear;
         * in 32-bit compatibility mode it is caller-selected
         * (0 = 16-bit, 1 = 32-bit operands).
         */
        dp->usd_def32 = (lmode == SDP_LONG) ? 0 : defopsz;

        dp->usd_long = lmode;   /* 64-bit mode */
        dp->usd_type = type;
        dp->usd_dpl = dpl;
        dp->usd_p = 1;
        dp->usd_gran = gran;            /* 0 = bytes, 1 = pages */

        /* Split the base and limit across their descriptor fields. */
        dp->usd_lobase = ba;
        dp->usd_midbase = ba >> 16;
        dp->usd_hibase = ba >> (16 + 8);
        dp->usd_lolimit = size;
        dp->usd_hilimit = (uintptr_t)size >> 16;
}

/*
 * Install system segment descriptor for LDT and TSS segments.
 */

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
        uintptr_t ba = (uintptr_t)base;

        /* Limit, split across its two descriptor fields. */
        dp->ssd_lolimit = size;
        dp->ssd_hilimit = (uintptr_t)size >> 16;

        /* 64-bit base address, split across its four descriptor fields. */
        dp->ssd_lobase = ba;
        dp->ssd_midbase = ba >> 16;
        dp->ssd_hibase = ba >> (16 + 8);
        dp->ssd_hi64base = ba >> (16 + 8 + 8);

        dp->ssd_type = type;
        dp->ssd_zero1 = 0;      /* must be zero */
        dp->ssd_zero2 = 0;
        dp->ssd_dpl = dpl;
        dp->ssd_p = 1;
        dp->ssd_gran = 0;       /* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
        uintptr_t base = 0;

        /* Reassemble the base address from its four descriptor fields. */
        base |= (uintptr_t)dp->ssd_lobase;
        base |= (uintptr_t)dp->ssd_midbase << 16;
        base |= (uintptr_t)dp->ssd_hibase << (16 + 8);
        base |= (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);

        return ((void *)base);
}

/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 *
 * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
 * all interrupts.  We have different ISTs for each class of exceptions that are
 * most likely to occur while handling an existing exception; while many of
 * these are just going to panic, it's nice not to trample on the existing
 * exception state for debugging purposes.
 *
 * Normal interrupts are all redirected unconditionally to the KPTI trampoline
 * stack space. This unifies the trampoline handling between user and kernel
 * space (and avoids the need to touch %gs).
 *
 * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when
 * we do a read from KMDB that causes another #PF.  Without its own IST, this
 * would stomp on the kernel's mcpu_kpti_flt frame.
 */
uint_t
idt_vector_to_ist(uint_t vector)
{
#if defined(__xpv)
        _NOTE(ARGUNUSED(vector));
        return (IST_NONE);
#else
        /* These vectors always get a dedicated IST, even without KPTI. */
        switch (vector) {
        case T_DBLFLT:
                return (IST_DF);
        case T_NMIFLT:
                return (IST_NMI);
        case T_MCE:
                return (IST_MCE);
        default:
                break;
        }

        /* Everything else only uses an IST when KPTI is active. */
        if (kpti_enable != 1)
                return (IST_NONE);

        switch (vector) {
        case T_BPTFLT:
        case T_SGLSTP:
                return (IST_DBG);
        case T_STKFLT:
        case T_GPFLT:
        case T_PGFLT:
                return (IST_NESTABLE);
        default:
                return (IST_DEFAULT);
        }
#endif
}

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t ist)
{
        uintptr_t off = (uintptr_t)func;

        /* 64-bit handler offset, split across three descriptor fields. */
        dp->sgd_looffset = off;
        dp->sgd_hioffset = off >> 16;
        dp->sgd_hi64offset = off >> (16 + 16);

        dp->sgd_selector = (uint16_t)sel;
        dp->sgd_ist = ist;
        dp->sgd_type = type;
        dp->sgd_dpl = dpl;
        dp->sgd_p = 1;
}

/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */

void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
#if defined(DEBUG)
        /*
         * This should never be a "system" segment, but it might be null.
         * (The two formerly separate, identical guards are merged here.)
         */
        if (udp->usd_p != 0 || udp->usd_type != 0) {
                ASSERT3U(udp->usd_type & SDT_S, !=, 0);
                /*
                 * We should always set the "accessed" bit (SDT_A), otherwise
                 * the CPU will write to the GDT whenever we change segment
                 * registers around.  With KPTI on, the GDT is read-only in
                 * the user page table, which causes crashes if we don't set
                 * this.
                 */
                ASSERT3U(udp->usd_type & SDT_A, !=, 0);
        }
#endif

#if defined(__xpv)
        /* The hypervisor owns the GDT page; update it via hypercall. */
        uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;

        if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
                panic("gdt_update_usegd: HYPERVISOR_update_descriptor");

#else   /* __xpv */
        CPU->cpu_gdt[sidx] = *udp;
#endif  /* __xpv */
}

/*
 * Writes the single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
int
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
#if defined(DEBUG)
        /*
         * This should never be a "system" segment, but it might be null.
         * (The two formerly separate, identical guards are merged here.)
         */
        if (udp->usd_p != 0 || udp->usd_type != 0) {
                ASSERT3U(udp->usd_type & SDT_S, !=, 0);
                /*
                 * We should always set the "accessed" bit (SDT_A), otherwise
                 * the CPU will write to the LDT whenever we change segment
                 * registers around.  With KPTI on, the LDT is read-only in
                 * the user page table, which causes crashes if we don't set
                 * this.
                 */
                ASSERT3U(udp->usd_type & SDT_A, !=, 0);
        }
#endif

#if defined(__xpv)
        uint64_t dpa;

        /* Physical address of the target LDT entry. */
        dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
            ((uintptr_t)ldp & PAGEOFFSET);

        /*
         * The hypervisor is a little more restrictive about what it
         * supports in the LDT.
         */
        if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
                return (EINVAL);

#else   /* __xpv */
        *ldp = *udp;

#endif  /* __xpv */
        return (0);
}

#if defined(__xpv)

/*
 * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor.
 * Returns true if a valid entry was written.
 */
int
xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
{
        trap_info_t *ti = ti_arg;       /* XXPV Aargh - segments.h comment */

        /*
         * skip holes in the IDT
         */
        if (GATESEG_GETOFFSET(sgd) == 0)
                return (0);

        /* Only interrupt gates are expected here. */
        ASSERT(sgd->sgd_type == SDT_SYSIGT);
        ti->vector = vec;
        TI_SET_DPL(ti, sgd->sgd_dpl);

        /*
         * Is this an interrupt gate?  (Always true given the ASSERT above;
         * the runtime check remains as a guard on non-DEBUG kernels.)
         */
        if (sgd->sgd_type == SDT_SYSIGT) {
                /* LINTED */
                TI_SET_IF(ti, 1);
        }
        ti->cs = sgd->sgd_selector;
        ti->cs |= SEL_KPL;      /* force into ring 3. see KCS_SEL  */
        ti->address = GATESEG_GETOFFSET(sgd);
        return (1);
}

/*
 * Convert a single hw format gate descriptor and write it into our virtual IDT.
 */
void
xen_idt_write(gate_desc_t *sgd, uint_t vec)
{
        /*
         * xen_set_trap_table() consumes a list terminated by an all-zero
         * entry, so supply one real slot plus a zeroed terminator.
         * bzero (rather than an initializer) also clears any padding.
         */
        trap_info_t ti[2];

        bzero(ti, sizeof (ti));

        /* A zero-offset gate is a hole in the IDT; nothing to install. */
        if (xen_idt_to_trap_info(vec, sgd, &ti[0]) == 0)
                return;
        if (xen_set_trap_table(ti) != 0)
                panic("xen_idt_write: xen_set_trap_table() failed");
}

#endif  /* __xpv */


/*
 * Build kernel GDT.
 */

/*
 * Populate the GDT entries common to both the bare-metal and hypervisor
 * cases.  The caller supplies a zeroed, page-aligned table.
 */
static void
init_gdt_common(user_desc_t *gdt)
{
        int i;

        ASSERT3P(gdt, !=, NULL);

        init_boot_gdt(gdt);

        /*
         * 64-bit kernel code segment.
         */
        set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
            SDP_PAGES, SDP_OP32);

        /*
         * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
         * mode, but we set it here to SDP_LIMIT_MAX so that we can use the
         * SYSRET instruction to return from system calls back to 32-bit
         * applications.  SYSRET doesn't update the base, limit, or attributes
         * of %ss or %ds descriptors. We therefore must ensure that the kernel
         * uses something, though it will be ignored by hardware, that is
         * compatible with 32-bit apps. For the same reason we must set the
         * default op size of this descriptor to 32-bit operands.
         */
        set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_KPL, SDP_PAGES, SDP_OP32);
        gdt[GDT_KDATA].usd_def32 = 1;

        /*
         * 64-bit user code segment.
         */
        set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
            SDP_PAGES, SDP_OP32);

        /*
         * 32-bit user code segment.
         */
        set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMERA,
            SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * See gdt_ucode32() and gdt_ucode_native().
         */
        ucs_on = ucs_off = gdt[GDT_UCODE];
        ucs_off.usd_p = 0;      /* forces #np fault */

        ucs32_on = ucs32_off = gdt[GDT_U32CODE];
        ucs32_off.usd_p = 0;    /* forces #np fault */

        /*
         * 32 and 64 bit data segments can actually share the same descriptor.
         * In long mode only the present bit is checked but all other fields
         * are loaded. But in compatibility mode all fields are interpreted
         * as in legacy mode so they must be set correctly for a 32-bit data
         * segment.
         */
        set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);

#if !defined(__xpv)

        /*
         * The 64-bit kernel has no default LDT. By default, the LDT descriptor
         * in the GDT is 0.
         */

        /*
         * Kernel TSS
         */
        set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
            sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

#endif  /* !__xpv */

        /*
         * Initialize fs and gs descriptors for 32 bit processes.
         * Only attributes and limits are initialized, the effective
         * base address is programmed via fsbase/gsbase.
         */
        set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);
        set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * Initialize the descriptors set aside for brand usage.
         * Only attributes and limits are initialized.
         *
         * Fix: write through the 'gdt' argument rather than the global
         * 'gdt0', so this routine initializes the table it was given.
         * (Identical behavior for the current sole caller, which passes
         * gdt0, but correct for any future caller.)
         */
        for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
                set_usegd(&gdt[i], SDP_SHORT, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
                    SEL_UPL, SDP_PAGES, SDP_OP32);

        /*
         * Initialize convenient zero base user descriptors for clearing
         * lwp private %fs and %gs descriptors in GDT. See setregs() for
         * an example.
         */
        set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
            SDP_BYTES, SDP_OP32);
        set_usegd(&zero_u32desc, SDP_SHORT, 0, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_UPL, SDP_PAGES, SDP_OP32);
}

#if defined(__xpv)

/*
 * Hypervisor variant: allocate cpu0's GDT, fill in the common entries,
 * hand the (read-only) page to Xen, and reload the segment registers and
 * segment bases.  Returns the new GDT.
 */
static user_desc_t *
init_gdt(void)
{
        uint64_t gdtpa;
        ulong_t ma[1];          /* XXPV should be a memory_t */
        ulong_t addr;

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        ASSERT3P(gdt0, !=, NULL);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);

        /*
         * XXX Since we never invoke kmdb until after the kernel takes
         * over the descriptor tables why not have it use the kernel's
         * selectors?
         */
        if (boothowto & RB_DEBUG) {
                /*
                 * NOTE(review): SDP_LONG for a 32-bit data segment
                 * (GDT_B32DATA) looks inconsistent with the other
                 * 32-bit descriptors in this file -- confirm intent.
                 */
                set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, SDP_LIMIT_MAX,
                    SDT_MEMRWA, SEL_KPL, SDP_PAGES, SDP_OP32);
                set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, SDP_LIMIT_MAX,
                    SDT_MEMERA, SEL_KPL, SDP_PAGES, SDP_OP32);
        }

        /*
         * Clear write permission for page containing the gdt and install it.
         * (Xen requires the GDT page to be read-only in the guest.)
         */
        gdtpa = pfn_to_pa(va_to_pfn(gdt0));
        ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
        kbm_read_only((uintptr_t)gdt0, gdtpa);
        xen_set_gdt(ma, NGDT);

        /*
         * Reload the segment registers to use the new GDT.
         * On 64-bit, fixup KCS_SEL to be in ring 3.
         * See KCS_SEL in segments.h.
         */
        load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

        /*
         *  setup %gs for kernel
         */
        xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

        /*
         * XX64 We should never dereference off "other gsbase" or
         * "fsbase".  So, we should arrange to point FSBASE and
         * KGSBASE somewhere truly awful e.g. point it at the last
         * valid address below the hole so that any attempts to index
         * off them cause an exception.
         *
         * For now, point it at 8G -- at least it should be unmapped
         * until some 64-bit processes run.
         */
        addr = 0x200000000ul;
        xen_set_segment_base(SEGBASE_FS, addr);
        xen_set_segment_base(SEGBASE_GS_USER, addr);
        xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

        return (gdt0);
}

#else   /* __xpv */

/*
 * Bare-metal variant: allocate cpu0's GDT, fill in the common entries,
 * load it with wr_gdtr, then reload the segment registers and the
 * FS/GS base MSRs.  Returns the new GDT.
 *
 * The ordering here matters: the GDTR must point at the new table before
 * the segment registers are reloaded from it.
 */
static user_desc_t *
init_gdt(void)
{
        desctbr_t       r_gdt;

#if !defined(__lint)
        /*
         * Our gdt is never larger than a single page.
         */
        ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
#endif
        gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(gdt0, PAGESIZE);

        init_gdt_common(gdt0);

        /*
         * Install our new GDT
         */
        r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
        r_gdt.dtr_base = (uintptr_t)gdt0;
        wr_gdtr(&r_gdt);

        /*
         * Reload the segment registers to use the new GDT
         */
        load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

        /*
         *  setup %gs for kernel
         */
        wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

        /*
         * XX64 We should never dereference off "other gsbase" or
         * "fsbase".  So, we should arrange to point FSBASE and
         * KGSBASE somewhere truly awful e.g. point it at the last
         * valid address below the hole so that any attempts to index
         * off them cause an exception.
         *
         * For now, point it at 8G -- at least it should be unmapped
         * until some 64-bit processes run.
         */
        wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
        wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
        return (gdt0);
}

#endif  /* __xpv */


/*
 * Build kernel IDT.
 *
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We also use interrupt gates for i386 as well even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
static void
init_idt_common(gate_desc_t *idt)
{
        set_gatesegd(&idt[T_ZERODIV],
            (kpti_enable == 1) ? &tr_div0trap : &div0trap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
        set_gatesegd(&idt[T_SGLSTP],
            (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
        set_gatesegd(&idt[T_NMIFLT],
            (kpti_enable == 1) ? &tr_nmiint : &nmiint,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
        set_gatesegd(&idt[T_BPTFLT],
            (kpti_enable == 1) ? &tr_brktrap : &brktrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
        set_gatesegd(&idt[T_OVFLW],
            (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
        set_gatesegd(&idt[T_BOUNDFLT],
            (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
        set_gatesegd(&idt[T_ILLINST],
            (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
        set_gatesegd(&idt[T_NOEXTFLT],
            (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));

        /*
         * double fault handler.
         *
         * Note that on the hypervisor a guest does not receive #df faults.
         * Instead a failsafe event is injected into the guest if its selectors
         * and/or stack is in a broken state. See xen_failsafe_callback.
         */
#if !defined(__xpv)
        set_gatesegd(&idt[T_DBLFLT],
            (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif  /* !__xpv */

        /*
         * T_EXTOVRFLT coprocessor-segment-overrun not supported.
         */
        set_gatesegd(&idt[T_TSSFLT],
            (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
        set_gatesegd(&idt[T_SEGFLT],
            (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
        set_gatesegd(&idt[T_STKFLT],
            (kpti_enable == 1) ? &tr_stktrap : &stktrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
        set_gatesegd(&idt[T_GPFLT],
            (kpti_enable == 1) ? &tr_gptrap : &gptrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
        set_gatesegd(&idt[T_PGFLT],
            (kpti_enable == 1) ? &tr_pftrap : &pftrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
        set_gatesegd(&idt[T_EXTERRFLT],
            (kpti_enable == 1) ? &tr_ndperr : &ndperr,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
        set_gatesegd(&idt[T_ALIGNMENT],
            (kpti_enable == 1) ? &tr_achktrap : &achktrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
        set_gatesegd(&idt[T_MCE],
            (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
        set_gatesegd(&idt[T_SIMDFPE],
            (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
            KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));

        /*
         * install fast trap handler at 210.
         */
        set_gatesegd(&idt[T_FASTTRAP],
            (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));

        /*
         * System call handler.
         */
        set_gatesegd(&idt[T_SYSCALLINT],
            (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));

        /*
         * Install the DTrace interrupt handler for the pid provider.
         */
        set_gatesegd(&idt[T_DTRACE_RET],
            (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
            KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));

        /*
         * Prepare interposing descriptor for the syscall handler
         * and cache copy of the default descriptor.
         *
         * Fix: snapshot the default descriptor from the 'idt' argument
         * rather than the global 'idt0', so the cached copy always comes
         * from the table this routine just populated.  (Identical behavior
         * for the current callers, which pass idt0.)
         */
        brand_tbl[0].ih_inum = T_SYSCALLINT;
        brand_tbl[0].ih_default_desc = idt[T_SYSCALLINT];

        set_gatesegd(&(brand_tbl[0].ih_interp_desc),
            (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
            &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
            idt_vector_to_ist(T_SYSCALLINT));

        /* Zero inum terminates the interposing-handler list. */
        brand_tbl[1].ih_inum = 0;
}

#if defined(__xpv)

static void
init_idt(gate_desc_t *idt)
{
        init_idt_common(idt);
}

#else   /* __xpv */

static void
init_idt(gate_desc_t *idt)
{
        char    symname[80];
        void    (*handler)(void);
        int     vec;

        /*
         * Start by pointing every vector at the 'reserved' trap handler,
         * then overwrite specific entries below.  T_EXTOVRFLT (9) is
         * unsupported and reserved since it can only be generated on a
         * 386 processor.  15 is also unsupported and reserved.
         */
        for (vec = 0; vec < NIDT; vec++) {
                set_gatesegd(&idt[vec],
                    (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(T_RESVTRAP));
        }

        /*
         * Vectors 20 through 31 are reserved.
         */
        for (vec = 20; vec < 32; vec++) {
                set_gatesegd(&idt[vec],
                    (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
                    KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(T_INVALTRAP));
        }

        /*
         * External interrupt vectors 32 through 255: resolve the
         * per-vector (tr_)ivct stubs by symbol name.
         */
        for (vec = 32; vec < 256; vec++) {
                (void) snprintf(symname, sizeof (symname),
                    (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", vec);
                handler = (void (*)(void))kobj_getsymvalue(symname, 0);
                if (handler == NULL)
                        panic("kobj_getsymvalue(%s) failed", symname);

                set_gatesegd(&idt[vec], handler, KCS_SEL, SDT_SYSIGT, TRP_KPL,
                    idt_vector_to_ist(vec));
        }

        /*
         * Now install the common ones. Note that it will overlay some
         * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
         */
        init_idt_common(idt);
}

#endif  /* __xpv */

/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
static void
init_ldt(void)
{
#if defined(__xpv)
        /* Tell the hypervisor there is no LDT. */
        xen_set_ldt(NULL, 0);
#else
        /* Load a null selector into LDTR: no LDT. */
        wr_ldtr(0);
#endif
}

#if !defined(__xpv)

/*
 * Initialize cpu0's kernel TSS (ktss0): rsp0 for KPTI, the IST stack
 * pointers, and an out-of-range I/O bitmap base; then load %tr.
 */
static void
init_tss(void)
{
        extern struct cpu cpus[];

        /*
         * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
         * context switch but it'll be overwritten with this same value anyway.
         */
        if (kpti_enable == 1) {
                ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
        }

        /*
         * Set up the IST stacks for double fault, NMI, MCE.
         * Each IST pointer is the address one past the end of its stack
         * array, since the stack grows down.
         */
        ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
        ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
        ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];

        /*
         * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
         * enabled), and also for KDI (always).
         */
        ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;

        if (kpti_enable == 1) {
                /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
                ktss0->tss_ist5 =
                    (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;

                /* This IST stack is used for all other intrs (for KPTI). */
                ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
        }

        /*
         * Set I/O bit map offset equal to size of TSS segment limit
         * for no I/O permission map. This will force all user I/O
         * instructions to generate #gp fault.
         */
        ktss0->tss_bitmapbase = sizeof (*ktss0);

        /*
         * Point %tr to descriptor for ktss0 in gdt.
         */
        wr_tsr(KTSS_SEL);
}

#endif  /* !__xpv */

#if defined(__xpv)

void
init_desctbls(void)
{
        uint_t vec;
        user_desc_t *gdt;

        /*
         * Setup and install our GDT.
         */
        gdt = init_gdt();

        /*
         * Store static pa of gdt to speed up pa_to_ma() translations
         * on lwp context switches.
         */
        ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
        CPU->cpu_gdt = gdt;
        CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));

        /*
         * Setup and install our IDT.  The hypervisor holds the real IDT,
         * so each hw-format gate is converted and handed over one vector
         * at a time via xen_idt_write().
         */
#if !defined(__lint)
        ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
        idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(idt0, PAGESIZE);
        init_idt(idt0);
        for (vec = 0; vec < NIDT; vec++)
                xen_idt_write(&idt0[vec], vec);

        CPU->cpu_idt = idt0;

        /*
         * set default kernel stack
         */
        xen_stack_switch(KDS_SEL,
            (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);

        xen_init_callbacks();

        init_ldt();
}

#else   /* __xpv */

/*
 * Build and install the boot CPU's descriptor tables (bare-metal variant):
 * allocate and zero the IDT, TSS and per-CPU LDT backing pages from boot
 * memory at their well-known VAs, construct the GDT and IDT contents, load
 * the hardware IDTR here, and load the task register via init_tss().
 */
void
init_desctbls(void)
{
        user_desc_t *gdt;
        desctbr_t idtr;

        /*
         * Allocate IDT and TSS structures on unique pages for better
         * performance in virtual machines.
         */
#if !defined(__lint)
        ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
#endif
        idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
            PAGESIZE, PAGESIZE);
        bzero(idt0, PAGESIZE);
#if !defined(__lint)
        ASSERT(sizeof (*ktss0) <= PAGESIZE);
#endif
        ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
            PAGESIZE, PAGESIZE);
        bzero(ktss0, PAGESIZE);


        /*
         * Setup and install our GDT.
         */
        gdt = init_gdt();
        ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
        CPU->cpu_gdt = gdt;

        /*
         * Initialize this CPU's LDT.  Length 0: the table is empty until
         * something populates it (presumably when a process needs a
         * private LDT).
         */
        CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
            LDT_CPU_SIZE, PAGESIZE);
        bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
        CPU->cpu_m.mcpu_ldt_len = 0;

        /*
         * Setup and install our IDT.  The descriptor-table limit is the
         * offset of the last valid byte, hence the -1.
         */
        init_idt(idt0);

        idtr.dtr_base = (uintptr_t)idt0;
        idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
        wr_idtr(&idtr);
        CPU->cpu_idt = idt0;


        /* Populate ktss0 and load %tr before publishing it via cpu_tss. */
        init_tss();
        CPU->cpu_tss = ktss0;
        init_ldt();

        /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
        kpti_safe_cr3 = (uint64_t)getcr3();
}

#endif  /* __xpv */

#ifndef __xpv
/*
 * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit, so
 * we have to manually fix it up ourselves.
 *
 * The caller may still need to make sure that it can't go off-CPU with the
 * incorrect limit, before calling this (such as disabling pre-emption).
 */
void
reset_gdtr_limit(void)
{
        desctbr_t gdt_reg;
        ulong_t saved_flags;

        /*
         * Do the read-modify-write of GDTR with interrupts blocked so
         * nothing can slip in between reading and reloading the register.
         */
        saved_flags = intr_clear();

        rd_gdtr(&gdt_reg);
        gdt_reg.dtr_limit = (NGDT * sizeof (user_desc_t)) - 1;
        wr_gdtr(&gdt_reg);

        intr_restore(saved_flags);
}
#endif /* !__xpv */

/*
 * We need a GDT owned by the kernel and not the bootstrap relatively
 * early in kernel initialization (e.g., to have segments we can reliably
 * catch an exception on).
 *
 * Initializes a GDT with segments normally defined in the boot loader.
 */
/*
 * Populate 'bgdt' with the boot/bridge segments.  bgdt must point at a
 * GDT large enough to hold the GDT_B* slots written below.
 *
 * set_usegd() argument order, as used throughout this file:
 * (descriptor, long-mode flag, base, limit, type, dpl, granularity,
 * default operand size) -- NOTE(review): confirm against set_usegd()'s
 * definition.
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
        ASSERT3P(bgdt, !=, NULL);

#ifdef  __xpv
        /* XXX: It is unclear why this 32-bit data segment is marked long. */
        set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMRWA,
            SEL_KPL, SDP_PAGES, SDP_OP32);
#else
        /*
         * Reset boot segments.  These ostensibly come from the boot loader,
         * but we reset them to match our expectations, particulary if we
         * are not using that loader.
         */
        set_usegd(&bgdt[GDT_B32DATA], SDP_SHORT, NULL, SDP_LIMIT_MAX,
            SDT_MEMRWA, SEL_KPL, SDP_PAGES, SDP_OP32);
        set_usegd(&bgdt[GDT_B32CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX,
            SDT_MEMERA, SEL_KPL, SDP_PAGES, SDP_OP32);

        /*
         * 16-bit segments for making BIOS calls (not applicable on all
         * architectures).  Note these pass 0 for granularity/operand size
         * rather than SDP_PAGES/SDP_OP32.
         */
        set_usegd(&bgdt[GDT_B16CODE], SDP_SHORT, NULL, SDP_LIMIT_MAX,
            SDT_MEMERA, SEL_KPL, 0, 0);
        /*
         * XXX: SDP_OP32 makes this a 32-bit segment, which seems wrong
         * here, but that's what boot_gdt.s used.
         */
        set_usegd(&bgdt[GDT_B16DATA], SDP_SHORT, NULL, SDP_LIMIT_MAX,
            SDT_MEMRWA, SEL_KPL, 0, SDP_OP32);
#endif  /* __xpv */

        /*
         * A 64-bit code segment used in early boot.  Early IDTs refer to this.
         */
        set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, SDP_LIMIT_MAX, SDT_MEMERA,
            SEL_KPL, SDP_PAGES, SDP_OP32);
}

/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void *arg __unused)
{
        gate_desc_t     *idt = CPU->cpu_idt;
        int             i;

        /*
         * We rewrite this CPU's IDT and MSRs, so the caller must keep us
         * from migrating: either preemption is disabled or we are running
         * at or above DISP_LEVEL.
         */
        ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

        /*
         * Swap in the branded gate for each interposed vector.  brand_tbl
         * is terminated by an entry whose ih_inum is 0.
         */
        for (i = 0; brand_tbl[i].ih_inum; i++) {
                idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
#if defined(__xpv)
                /* Under Xen, the hypervisor must see the rewritten gate. */
                xen_idt_write(&idt[brand_tbl[i].ih_inum],
                    brand_tbl[i].ih_inum);
#endif
        }

#if defined(__xpv)

        /*
         * Currently the hypervisor only supports 64-bit syscalls via
         * syscall instruction. The 32-bit syscalls are handled by
         * interrupt gate above.
         */
        xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
            CALLBACKF_mask_events);

#else

        /*
         * AMD-style syscall: LSTAR holds the 64-bit entry point and CSTAR
         * the 32-bit (compatibility-mode) one.  With KPTI enabled, point
         * at the trampoline (tr_*) variants instead of the direct entries.
         */
        if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
                } else {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
                }
        }

#endif

        /* Intel sysenter entry point, likewise KPTI-trampolined. */
        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_INTC_SEP_EIP,
                            (uintptr_t)tr_brand_sys_sysenter);
                } else {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
                }
        }
}

/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void *arg __unused)
{
        gate_desc_t     *idt = CPU->cpu_idt;
        int i;

        /*
         * As in brand_interpositioning_enable(): we rewrite per-CPU state,
         * so the caller must pin us to this CPU (preemption disabled or
         * pil >= DISP_LEVEL).
         */
        ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

        /*
         * Restore the default gate for each interposed vector.  brand_tbl
         * is terminated by an entry whose ih_inum is 0.
         */
        for (i = 0; brand_tbl[i].ih_inum; i++) {
                idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
#if defined(__xpv)
                /* Under Xen, the hypervisor must see the rewritten gate. */
                xen_idt_write(&idt[brand_tbl[i].ih_inum],
                    brand_tbl[i].ih_inum);
#endif
        }

#if defined(__xpv)

        /*
         * See comment above in brand_interpositioning_enable.
         */
        xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
            CALLBACKF_mask_events);

#else

        /*
         * Restore the standard syscall entry points: LSTAR for 64-bit,
         * CSTAR for 32-bit, using the KPTI trampoline (tr_*) variants
         * when KPTI is enabled.
         */
        if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
                } else {
                        wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
                        wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
                }
        }

#endif

        /* Restore the standard Intel sysenter entry point. */
        if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
                if (kpti_enable == 1) {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
                } else {
                        wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
                }
        }
}