root/sys/cddl/dev/kinst/amd64/kinst_isa.c
/*
 * SPDX-License-Identifier: CDDL 1.0
 *
 * Copyright (c) 2022 Christos Margiolis <christos@FreeBSD.org>
 * Copyright (c) 2022 Mark Johnston <markj@FreeBSD.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Christos Margiolis
 * <christos@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/pcpu.h>

#include <machine/cpufunc.h>
#include <machine/md_var.h>

#include <sys/dtrace.h>
#include <cddl/dev/dtrace/dtrace_cddl.h>
#include <dis_tables.h>

#include "kinst.h"

#define KINST_PUSHL_RBP         0x55
#define KINST_STI               0xfb
#define KINST_POPF              0x9d

#define KINST_MODRM_MOD(b)      (((b) & 0xc0) >> 6)
#define KINST_MODRM_REG(b)      (((b) & 0x38) >> 3)
#define KINST_MODRM_RM(b)       ((b) & 0x07)

#define KINST_SIB_SCALE(s)      (((s) & 0xc0) >> 6)
#define KINST_SIB_INDEX(s)      (((s) & 0x38) >> 3)
#define KINST_SIB_BASE(s)       (((s) & 0x07) >> 0)

#define KINST_REX_W(r)          (((r) & 0x08) >> 3)
#define KINST_REX_R(r)          (((r) & 0x04) >> 2)
#define KINST_REX_X(r)          (((r) & 0x02) >> 1)
#define KINST_REX_B(r)          (((r) & 0x01) >> 0)

#define KINST_F_CALL            0x0001  /* instruction is a "call" */
#define KINST_F_DIRECT_CALL     0x0002  /* instruction is a direct call */
#define KINST_F_RIPREL          0x0004  /* instruction is position-dependent */
#define KINST_F_JMP             0x0008  /* instruction is a %rip-relative jmp */
#define KINST_F_MOD_DIRECT      0x0010  /* operand is not a memory address */

/*
 * Per-CPU trampolines used when the interrupted thread is executing with
 * interrupts disabled.  If an interrupt is raised while executing a trampoline,
 * the interrupt thread cannot safely overwrite its trampoline if it hits a
 * kinst probe while executing the interrupt handler.
 */
DPCPU_DEFINE_STATIC(uint8_t *, intr_tramp);

/*
 * Map ModR/M register bits to a trapframe offset.
 */
static int
kinst_regoff(int reg)
{
#define _MATCH_REG(i, reg)                      \
        case i:                                 \
                return (offsetof(struct trapframe, tf_ ## reg) / \
                    sizeof(register_t))
        switch (reg) {
        _MATCH_REG( 0, rax);
        _MATCH_REG( 1, rcx);
        _MATCH_REG( 2, rdx);
        _MATCH_REG( 3, rbx);
        _MATCH_REG( 4, rsp); /* SIB when mod != 3 */
        _MATCH_REG( 5, rbp);
        _MATCH_REG( 6, rsi);
        _MATCH_REG( 7, rdi);
        _MATCH_REG( 8, r8); /* REX.R is set */
        _MATCH_REG( 9, r9);
        _MATCH_REG(10, r10);
        _MATCH_REG(11, r11);
        _MATCH_REG(12, r12);
        _MATCH_REG(13, r13);
        _MATCH_REG(14, r14);
        _MATCH_REG(15, r15);
        }
#undef _MATCH_REG
        panic("%s: unhandled register index %d", __func__, reg);
}

/*
 * Obtain the specified register's value.
 */
static uint64_t
kinst_regval(struct trapframe *frame, int reg)
{
        if (reg == -1)
                return (0);
        return (((register_t *)frame)[kinst_regoff(reg)]);
}

static uint32_t
kinst_riprel_disp(struct kinst_probe *kp, void *dst)
{
        return ((uint32_t)((intptr_t)kp->kp_patchpoint + kp->kp_md.disp -
            (intptr_t)dst));
}

static void
kinst_trampoline_populate(struct kinst_probe *kp, uint8_t *tramp)
{
        uint8_t *instr;
        uint32_t disp;
        int ilen;

        ilen = kp->kp_md.tinstlen;

        kinst_memcpy(tramp, kp->kp_md.template, ilen);
        if ((kp->kp_md.flags & KINST_F_RIPREL) != 0) {
                disp = kinst_riprel_disp(kp, tramp);
                kinst_memcpy(&tramp[kp->kp_md.dispoff], &disp, sizeof(uint32_t));
        }

        /*
         * The following position-independent jmp takes us back to the
         * original code.  It is encoded as "jmp *0(%rip)" (six bytes),
         * followed by the absolute address of the instruction following
         * the one that was traced (eight bytes).
         */
        tramp[ilen + 0] = 0xff;
        tramp[ilen + 1] = 0x25;
        tramp[ilen + 2] = 0x00;
        tramp[ilen + 3] = 0x00;
        tramp[ilen + 4] = 0x00;
        tramp[ilen + 5] = 0x00;
        instr = kp->kp_patchpoint + kp->kp_md.instlen;
        kinst_memcpy(&tramp[ilen + 6], &instr, sizeof(uintptr_t));
}

int
kinst_invop(uintptr_t addr, struct trapframe *frame, uintptr_t scratch)
{
        solaris_cpu_t *cpu;
        uintptr_t *stack, retaddr;
        struct kinst_probe *kp;
        struct kinst_probe_md *kpmd;
        uint8_t *tramp;

        stack = (uintptr_t *)frame->tf_rsp;
        cpu = &solaris_cpu[curcpu];

        LIST_FOREACH(kp, KINST_GETPROBE(addr), kp_hashnext) {
                if ((uintptr_t)kp->kp_patchpoint == addr)
                        break;
        }
        if (kp == NULL)
                return (0);

        /*
         * Report the address of the breakpoint for the benefit of consumers
         * fetching register values with regs[].
         */
        frame->tf_rip--;

        DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
        cpu->cpu_dtrace_caller = stack[0];
        DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
        dtrace_probe(kp->kp_id, 0, 0, 0, 0, 0);
        cpu->cpu_dtrace_caller = 0;

        kpmd = &kp->kp_md;
        if ((kpmd->flags & KINST_F_CALL) != 0) {
                /*
                 * dtrace_invop_start() reserves space on the stack to
                 * store the return address of the call instruction.
                 */
                retaddr = (uintptr_t)(kp->kp_patchpoint + kpmd->instlen);
                *(uintptr_t *)scratch = retaddr;

                if ((kpmd->flags & KINST_F_DIRECT_CALL) != 0) {
                        frame->tf_rip = (uintptr_t)(kp->kp_patchpoint +
                            kpmd->disp + kpmd->instlen);
                } else {
                        register_t rval;

                        if (kpmd->reg1 == -1 && kpmd->reg2 == -1) {
                                /* rip-relative */
                                rval = frame->tf_rip + kpmd->instlen;
                        } else {
                                /* indirect */
                                rval = kinst_regval(frame, kpmd->reg1) +
                                    (kinst_regval(frame, kpmd->reg2) <<
                                    kpmd->scale);
                        }

                        if ((kpmd->flags & KINST_F_MOD_DIRECT) != 0) {
                                frame->tf_rip = rval + kpmd->disp;
                        } else {
                                frame->tf_rip =
                                    *(uintptr_t *)(rval + kpmd->disp);
                        }
                }
                return (DTRACE_INVOP_CALL);
        } else {
                if ((frame->tf_rflags & PSL_I) == 0)
                        tramp = DPCPU_GET(intr_tramp);
                else
                        tramp = curthread->t_kinst_tramp;
                if (tramp == NULL) {
                        /*
                         * A trampoline allocation failed, so this probe is
                         * effectively disabled.  Restore the original
                         * instruction.
                         *
                         * We can't safely print anything here, but the
                         * trampoline allocator should have left a breadcrumb in
                         * the dmesg.
                         */
                        kinst_patch_tracepoint(kp, kp->kp_savedval);
                        frame->tf_rip = (register_t)kp->kp_patchpoint;
                } else {
                        kinst_trampoline_populate(kp, tramp);
                        frame->tf_rip = (register_t)tramp;
                }
                return (DTRACE_INVOP_NOP);
        }
}

void
kinst_patch_tracepoint(struct kinst_probe *kp, kinst_patchval_t val)
{
        register_t reg;
        int oldwp;

        reg = intr_disable();
        oldwp = disable_wp();
        *kp->kp_patchpoint = val;
        restore_wp(oldwp);
        intr_restore(reg);
}

static void
kinst_set_disp8(struct kinst_probe *kp, uint8_t byte)
{
        kp->kp_md.disp = (int64_t)(int8_t)byte;
}

static void
kinst_set_disp32(struct kinst_probe *kp, uint8_t *bytes)
{
        int32_t disp32;

        memcpy(&disp32, bytes, sizeof(disp32));
        kp->kp_md.disp = (int64_t)disp32;
}

/*
 * Set up all of the state needed to faithfully execute a probed instruction.
 *
 * In the simple case, we copy the instruction unmodified to a per-thread
 * trampoline, wherein it is followed by a jump back to the original code.
 * - Instructions can have %rip as an operand:
 *   - with %rip-relative addressing encoded in ModR/M, or
 *   - implicitly as a part of the instruction definition (jmp, call).
 * - Call instructions (which may be %rip-relative) need to push the correct
 *   return address onto the stack.
 *
 * Call instructions are simple enough to be emulated in software, so we simply
 * do not use the trampoline mechanism in that case.  kinst_invop() will compute
 * the branch target using the address info computed here (register operands and
 * displacement).
 *
 * %rip-relative operands encoded using the ModR/M byte always use a 32-bit
 * displacement; when populating the trampoline the displacement is adjusted to
 * be relative to the trampoline address.  Trampolines are always allocated
 * above KERNBASE for this reason.
 *
 * For other %rip-relative operands (just jumps) we take the same approach.
 * Instructions which specify an 8-bit displacement must be rewritten to use a
 * 32-bit displacement.
 */
static int
kinst_instr_dissect(struct kinst_probe *kp, uint8_t **instr)
{
        struct kinst_probe_md *kpmd;
        dis86_t d86;
        uint8_t *bytes, modrm, rex;
        int dispoff, i, ilen, opcidx;

        kpmd = &kp->kp_md;

        d86.d86_data = instr;
        d86.d86_get_byte = dtrace_dis_get_byte;
        d86.d86_check_func = NULL;
        if (dtrace_disx86(&d86, SIZE64) != 0) {
                KINST_LOG("failed to disassemble instruction at: %p", *instr);
                return (EINVAL);
        }
        bytes = d86.d86_bytes;
        kpmd->instlen = kpmd->tinstlen = d86.d86_len;

        /*
         * Skip over prefixes, save REX.
         */
        rex = 0;
        for (i = 0; i < kpmd->instlen; i++) {
                switch (bytes[i]) {
                case 0xf0 ... 0xf3:
                        /* group 1 */
                        continue;
                case 0x26:
                case 0x2e:
                case 0x36:
                case 0x3e:
                case 0x64:
                case 0x65:
                        /* group 2 */
                        continue;
                case 0x66:
                        /* group 3 */
                        continue;
                case 0x67:
                        /* group 4 */
                        continue;
                case 0x40 ... 0x4f:
                        /* REX */
                        rex = bytes[i];
                        continue;
                }
                break;
        }
        KASSERT(i < kpmd->instlen,
            ("%s: failed to disassemble instruction at %p", __func__, bytes));
        opcidx = i;

        /*
         * Identify instructions of interest by opcode: calls and jumps.
         * Extract displacements.
         */
        dispoff = -1;
        switch (bytes[opcidx]) {
        case 0x0f:
                switch (bytes[opcidx + 1]) {
                case 0x80 ... 0x8f:
                        /* conditional jmp near */
                        kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
                        dispoff = opcidx + 2;
                        kinst_set_disp32(kp, &bytes[dispoff]);
                        break;
                }
                break;
        case 0xe3:
                /*
                 * There is no straightforward way to translate this instruction
                 * to use a 32-bit displacement.  Fortunately, it is rarely
                 * used.
                 */
                return (EINVAL);
        case 0x70 ... 0x7f:
                /* conditional jmp short */
                kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
                dispoff = opcidx + 1;
                kinst_set_disp8(kp, bytes[dispoff]);
                break;
        case 0xe9:
                /* unconditional jmp near */
                kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
                dispoff = opcidx + 1;
                kinst_set_disp32(kp, &bytes[dispoff]);
                break;
        case 0xeb:
                /* unconditional jmp short */
                kpmd->flags |= KINST_F_JMP | KINST_F_RIPREL;
                dispoff = opcidx + 1;
                kinst_set_disp8(kp, bytes[dispoff]);
                break;
        case 0xe8:
        case 0x9a:
                /* direct call */
                kpmd->flags |= KINST_F_CALL | KINST_F_DIRECT_CALL;
                dispoff = opcidx + 1;
                kinst_set_disp32(kp, &bytes[dispoff]);
                break;
        case 0xff:
                KASSERT(d86.d86_got_modrm,
                    ("no ModR/M byte for instr at %p", *instr - kpmd->instlen));
                switch (KINST_MODRM_REG(bytes[d86.d86_rmindex])) {
                case 0x02:
                case 0x03:
                        /* indirect call */
                        kpmd->flags |= KINST_F_CALL;
                        break;
                case 0x04:
                case 0x05:
                        /* indirect jump */
                        kpmd->flags |= KINST_F_JMP;
                        break;
                }
        }

        /*
         * If there's a ModR/M byte, we need to check it to see if the operand
         * is %rip-relative, and rewrite the displacement if so.  If not, we
         * might still have to extract operand info if this is a call
         * instruction.
         */
        if (d86.d86_got_modrm) {
                uint8_t mod, rm, sib;

                kpmd->reg1 = kpmd->reg2 = -1;

                modrm = bytes[d86.d86_rmindex];
                mod = KINST_MODRM_MOD(modrm);
                rm = KINST_MODRM_RM(modrm);
                if (mod == 0 && rm == 5) {
                        kpmd->flags |= KINST_F_RIPREL;
                        dispoff = d86.d86_rmindex + 1;
                        kinst_set_disp32(kp, &bytes[dispoff]);
                } else if ((kpmd->flags & KINST_F_CALL) != 0) {
                        bool havesib;

                        havesib = (mod != 3 && rm == 4);
                        dispoff = d86.d86_rmindex + (havesib ? 2 : 1);
                        if (mod == 1)
                                kinst_set_disp8(kp, bytes[dispoff]);
                        else if (mod == 2)
                                kinst_set_disp32(kp, &bytes[dispoff]);
                        else if (mod == 3)
                                kpmd->flags |= KINST_F_MOD_DIRECT;

                        if (havesib) {
                                sib = bytes[d86.d86_rmindex + 1];
                                if (KINST_SIB_BASE(sib) != 5) {
                                        kpmd->reg1 = KINST_SIB_BASE(sib) |
                                            (KINST_REX_B(rex) << 3);
                                }
                                kpmd->scale = KINST_SIB_SCALE(sib);
                                kpmd->reg2 = KINST_SIB_INDEX(sib) |
                                    (KINST_REX_X(rex) << 3);
                        } else {
                                kpmd->reg1 = rm | (KINST_REX_B(rex) << 3);
                        }
                }
        }

        /*
         * Calls are emulated in software; once operands are decoded we have
         * nothing else to do.
         */
        if ((kpmd->flags & KINST_F_CALL) != 0)
                return (0);

        /*
         * Allocate and populate an instruction trampoline template.
         *
         * Position-independent instructions can simply be copied, but
         * position-dependent instructions require some surgery: jump
         * instructions with an 8-bit displacement need to be converted to use a
         * 32-bit displacement, and the adjusted displacement needs to be
         * computed.
         */
        ilen = kpmd->instlen;
        if ((kpmd->flags & KINST_F_RIPREL) != 0) {
                if ((kpmd->flags & KINST_F_JMP) == 0 ||
                    bytes[opcidx] == 0x0f ||
                    bytes[opcidx] == 0xe9 ||
                    bytes[opcidx] == 0xff) {
                        memcpy(kpmd->template, bytes, dispoff);
                        memcpy(&kpmd->template[dispoff + 4],
                            &bytes[dispoff + 4], ilen - (dispoff + 4));
                        kpmd->dispoff = dispoff;
                } else if (bytes[opcidx] == 0xeb) {
                        memcpy(kpmd->template, bytes, opcidx);
                        kpmd->template[opcidx] = 0xe9;
                        kpmd->dispoff = opcidx + 1;

                        /* Instruction length changes from 2 to 5. */
                        kpmd->tinstlen = 5;
                        kpmd->disp -= 3;
                } else if (bytes[opcidx] >= 0x70 && bytes[opcidx] <= 0x7f)  {
                        memcpy(kpmd->template, bytes, opcidx);
                        kpmd->template[opcidx] = 0x0f;
                        kpmd->template[opcidx + 1] = bytes[opcidx] + 0x10;
                        kpmd->dispoff = opcidx + 2;

                        /* Instruction length changes from 2 to 6. */
                        kpmd->tinstlen = 6;
                        kpmd->disp -= 4;
                } else {
                        panic("unhandled opcode %#x", bytes[opcidx]);
                }
        } else {
                memcpy(kpmd->template, bytes, ilen);
        }

        return (0);
}

int
kinst_make_probe(linker_file_t lf, int symindx, linker_symval_t *symval,
    void *opaque)
{
        struct kinst_probe *kp;
        dtrace_kinst_probedesc_t *pd;
        const char *func;
        int error, instrsize, n, off;
        uint8_t *instr, *limit, *tmp;
        bool push_found;

        pd = opaque;
        func = symval->name;
        if (kinst_excluded(func))
                return (0);
        if (strcmp(func, pd->kpd_func) != 0)
                return (0);

        instr = (uint8_t *)symval->value;
        limit = (uint8_t *)symval->value + symval->size;
        if (instr >= limit)
                return (0);

        /*
         * Refuse to instrument functions lacking the usual frame pointer
         * manipulations since they might correspond to exception handlers.
         */
        tmp = instr;
        push_found = false;
        while (tmp < limit) {
                /*
                 * Checking for 'pop %rbp' as well makes the filtering too
                 * strict as it would skip functions that never return (e.g.,
                 * vnlru_proc()).
                 */
                if (*tmp == KINST_PUSHL_RBP) {
                        push_found = true;
                        break;
                }
                tmp += dtrace_instr_size(tmp);
        }
        if (!push_found)
                return (0);

        n = 0;
        while (instr < limit) {
                instrsize = dtrace_instr_size(instr);
                off = (int)(instr - (uint8_t *)symval->value);
                if (pd->kpd_off != -1 && off != pd->kpd_off) {
                        instr += instrsize;
                        continue;
                }

                /*
                 * Check for instructions which may enable interrupts.  Such
                 * instructions are tricky to trace since it is unclear whether
                 * to use the per-thread or per-CPU trampolines.  Since they are
                 * rare, we don't bother to implement special handling for them.
                 *
                 * If the caller specified an offset, return an error, otherwise
                 * silently ignore the instruction so that it remains possible
                 * to enable all instructions in a function.
                 */
                if (instrsize == 1 &&
                    (instr[0] == KINST_POPF || instr[0] == KINST_STI)) {
                        if (pd->kpd_off != -1)
                                return (EINVAL);
                        instr += instrsize;
                        continue;
                }

                /*
                 * Prevent separate dtrace(1) instances from creating copies of
                 * the same probe.
                 */
                LIST_FOREACH(kp, KINST_GETPROBE(instr), kp_hashnext) {
                        if (strcmp(kp->kp_func, func) == 0 &&
                            strtol(kp->kp_name, NULL, 10) == off)
                                return (0);
                }
                if (++n > KINST_PROBETAB_MAX) {
                        KINST_LOG("probe list full: %d entries", n);
                        return (ENOMEM);
                }
                kp = malloc(sizeof(struct kinst_probe), M_KINST,
                    M_WAITOK | M_ZERO);
                kp->kp_func = func;
                snprintf(kp->kp_name, sizeof(kp->kp_name), "%d", off);
                kp->kp_savedval = *instr;
                kp->kp_patchval = KINST_PATCHVAL;
                kp->kp_patchpoint = instr;

                error = kinst_instr_dissect(kp, &instr);
                if (error != 0)
                        return (error);

                kinst_probe_create(kp, lf);
        }

        return (0);
}

int
kinst_md_init(void)
{
        uint8_t *tramp;
        int cpu;

        CPU_FOREACH(cpu) {
                tramp = kinst_trampoline_alloc(M_WAITOK);
                if (tramp == NULL)
                        return (ENOMEM);
                DPCPU_ID_SET(cpu, intr_tramp, tramp);
        }

        return (0);
}

void
kinst_md_deinit(void)
{
        uint8_t *tramp;
        int cpu;

        CPU_FOREACH(cpu) {
                tramp = DPCPU_ID_GET(cpu, intr_tramp);
                if (tramp != NULL) {
                        kinst_trampoline_dealloc(tramp);
                        DPCPU_ID_SET(cpu, intr_tramp, NULL);
                }
        }
}

/*
 * Exclude machine-dependent functions that are not safe-to-trace.
 */
bool
kinst_md_excluded(const char *name)
{
        return (false);
}