/* sys/amd64/pt/pt.c */
/*
 * Copyright (c) 2025 Bojan Novković <bnovkov@freebsd.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and disable tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure for
 *   each traced CPU core or thread. Upon initialization, a ToPA configuration
 *   is generated for each 'pt_ctx' structure using the HWT tracing buffers.
 *   The HWT tracing buffer is split into 4K ToPA entries. Currently, each
 *   4K ToPA entry is configured to trigger an interrupt after it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 *
 */

#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/hwt.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/fpu.h>
#include <machine/smp.h>
#include <machine/specialreg.h>

#include <x86/apicvar.h>
#include <x86/x86_var.h>

#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_record.h>
#include <dev/hwt/hwt_thread.h>

#include <amd64/pt/pt.h>

#ifdef PT_DEBUG
#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif
#define PT_SUPPORTED_FLAGS                                              \
        (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |       \
            RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define PT_XSAVE_MASK (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define PT_XSTATE_BV (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define PT_MAX_IP_RANGES 2

#define PT_TOPA_MASK_PTRS 0x7f
#define PT_TOPA_PAGE_MASK 0xffffff80
#define PT_TOPA_PAGE_SHIFT 7

#define CPUID_PT_LEAF   0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

static void pt_send_buffer_record(void *arg);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 *
 * Mirrors the layout of the PT state component saved and restored by
 * XSAVES/XRSTORS (see pt_cpu_toggle_local()); each field shadows the
 * corresponding IA32_RTIT_* MSR.
 */
struct pt_ext_area {
        uint64_t rtit_ctl;              /* Trace enable and feature control. */
        uint64_t rtit_output_base;      /* Physical base of the ToPA table. */
        uint64_t rtit_output_mask_ptrs; /* Current ToPA entry and offset. */
        uint64_t rtit_status;
        uint64_t rtit_cr3_match;        /* CR3 filtering target. */
        uint64_t rtit_addr0_a;          /* IP filter range 0 start. */
        uint64_t rtit_addr0_b;          /* IP filter range 0 end. */
        uint64_t rtit_addr1_a;          /* IP filter range 1 start. */
        uint64_t rtit_addr1_b;          /* IP filter range 1 end. */
};

/* Per-context ToPA trace buffer bookkeeping. */
struct pt_buffer {
        uint64_t *topa_hw; /* ToPA table entries. */
        size_t size; /* Trace buffer size in bytes. */
        struct mtx lock; /* Lock for fields below. */
        vm_offset_t offset; /* Last harvested write offset; accessed with
                             * 64-bit acquire/release atomics. */
};

/* Per-CPU or per-thread PT tracing context. */
struct pt_ctx {
        int id; /* Context id: cpu id or thread id (see pt_init_ctx()). */
        struct pt_buffer buf; /* ToPA buffer metadata */
        struct hwt_context *hwt_ctx; /* Owning HWT session. */
        uint8_t *save_area; /* PT XSAVE area */
};
/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

/* Per-CPU tracing activity state. */
enum pt_cpu_state { PT_INACTIVE = 0, PT_ACTIVE };

static struct pt_cpu {
        struct pt_ctx *ctx;      /* active PT tracing context */
        enum pt_cpu_state state; /* used as part of trace stop protocol */
        void *swi_cookie;        /* Software interrupt handler context */
        int in_pcint_handler;    /* Set while inside the ToPA PMI handler;
                                  * used by the stop path to synchronize. */
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
        uint32_t l0_eax; /* CPUID leaf 0x14, sub-leaf 0 registers. */
        uint32_t l0_ebx;
        uint32_t l0_ecx;
        uint32_t l1_eax; /* CPUID leaf 0x14, sub-leaf 1 registers. */
        uint32_t l1_ebx;
        size_t xsave_area_size;   /* Total size of the PT XSAVE area. */
        size_t xstate_hdr_offset; /* Offset of the XSAVE header. */
        size_t pt_xsave_offset;   /* Offset of the PT state component. */
} pt_info  __read_mostly;

/* Set once the backend is fully registered and interrupts are wired up. */
static bool initialized = false;
/* Ensures only one CPU-mode tracing session is active at a time. */
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
        return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
        atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
        return ((struct xstate_hdr *)(ctx->save_area +
            pt_info.xstate_hdr_offset));
}


static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
        return ((struct pt_ext_area *)(ctx->save_area +
            pt_info.pt_xsave_offset));
}

/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. No wrap detection is performed here; the offset is
 * simply recomputed from the hardware pointers.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
        uint64_t reg;
        uint64_t offset;

        /* Update buffer offset. */
        reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
        /*
         * Bits 31:7 hold the index of the current ToPA table entry; each
         * entry maps a 4K page (see pt_topa_prepare()), so the index
         * scales by PAGE_SIZE.  Bits 63:32 hold the byte offset within
         * the current entry's page.
         */
        offset = ((reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT) * PAGE_SIZE;
        offset += (reg >> 32);

        /* Publish for readers that use matching acquire loads. */
        atomic_store_rel_64(&buf->offset, offset);
}

/*
 * Translates the latest trace buffer offset into a HWT_RECORD_BUFFER
 * record (page number plus offset within the page).
 */
static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
        vm_offset_t off;

        /* Snapshot the offset published by pt_update_buffer(). */
        off = atomic_load_acq_64(&buf->offset);

        rec->record_type = HWT_RECORD_BUFFER;
        rec->buf_id = id;
        rec->curpage = off / PAGE_SIZE;
        rec->offset = off % PAGE_SIZE;
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 *
 * Enabling restores the staged PT register state with XRSTORS (whose
 * rtit_ctl has RTIT_CTL_TRACEEN set, see pt_backend_configure());
 * disabling saves the live state with XSAVES.  XCR0, IA32_XSS, and
 * CR0.TS are temporarily adjusted and restored around the operation.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
        u_long xcr0, cr0;
        u_long xss;

        /* Clear CR0.TS so the XSAVE instructions do not fault. */
        cr0 = rcr0();
        if (cr0 & CR0_TS)
                clts();
        /* Temporarily enable the x87/SSE user-state bits in XCR0. */
        xcr0 = rxcr(XCR0);
        if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
                load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
        /* PT is a supervisor state component; enable it in IA32_XSS. */
        xss = rdmsr(MSR_IA32_XSS);
        wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

        if (!enable) {
                KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
                    ("%s: PT is disabled", __func__));
                xsaves(save_area, XFEATURE_ENABLED_PT);
        } else {
                KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
                    ("%s: PT is enabled", __func__));
                xrstors(save_area, XFEATURE_ENABLED_PT);
        }
        /* Restore the previous extended-state configuration. */
        wrmsr(MSR_IA32_XSS, xss);
        if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
                load_xcr(XCR0, xcr0);
        if (cr0 & CR0_TS)
                load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 * Runs on the target CPU, either directly or via smp_rendezvous.
 */
static void
pt_cpu_start(void *dummy)
{
        struct pt_cpu *cpu;

        cpu = &pt_pcpu[curcpu];
        MPASS(cpu->ctx != NULL);

        dprintf("%s: curcpu %d\n", __func__, curcpu);
        /* Mark active first so the ToPA PMI handler accepts interrupts. */
        pt_cpu_set_state(curcpu, PT_ACTIVE);
        load_cr4(rcr4() | CR4_XSAVE);
        wrmsr(MSR_IA32_RTIT_STATUS, 0);
        /* Restoring the staged PT state sets RTIT_CTL_TRACEEN. */
        pt_cpu_toggle_local(cpu->ctx->save_area, true);
}

/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
        struct pt_cpu *cpu;
        struct pt_ctx *ctx;

        cpu = &pt_pcpu[curcpu];
        ctx = cpu->ctx;

        dprintf("%s: curcpu %d\n", __func__, curcpu);
        /* Shutdown may occur before PT gets properly configured. */
        if (ctx == NULL) {
                dprintf("%s: missing context on cpu %d; bailing\n", __func__,
                    curcpu);
                return;
        }
        pt_cpu_toggle_local(cpu->ctx->save_area, false);
        pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 *
 * Always returns 0; the M_WAITOK allocation cannot fail.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
        struct pt_buffer *buf;
        size_t topa_size;
        int i;

        topa_size = TOPA_SIZE_4K;
        buf = &ctx->buf;

        KASSERT(buf->topa_hw == NULL,
            ("%s: ToPA info already exists", __func__));
        /* One entry per trace buffer page plus a trailing END entry. */
        buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
            M_ZERO | M_WAITOK);
        dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
        buf->size = vm->npages * PAGE_SIZE;
        for (i = 0; i < vm->npages; i++) {
                buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
                /*
                 * XXX: TOPA_INT should ideally be set according to
                 * expected amount of incoming trace data. Too few TOPA_INT
                 * entries will not trigger interrupts often enough when tracing
                 * smaller functions.
                 */
                buf->topa_hw[i] |= TOPA_INT;
        }
        /* Point the END entry back at the table itself to form a ring. */
        buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

        return (0);
}

/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 *
 * Returns 0 on success, EINVAL for an invalid range count, or
 * ENXIO when the CPU lacks IP filtering support.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
        struct pt_ext_area *pt_ext;
        int nranges_supp, n, error = 0;

        pt_ext = pt_ctx_get_ext_area(ctx);
        if (pt_info.l0_ebx & CPUPT_IPF) {
                /* Number of address ranges the CPU supports. */
                nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
                    CPUPT_NADDR_S;

                /* The XSAVE layout only exposes two range register pairs. */
                if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
                        nranges_supp = PT_IP_FILTER_MAX_RANGES;
                n = cfg->nranges;
                if (n > nranges_supp) {
                        printf("%s: %d IP filtering ranges requested, CPU "
                               "supports %d, truncating\n",
                            __func__, n, nranges_supp);
                        n = nranges_supp;
                }

                switch (n) {
                case 2:
                        pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
                        pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
                        pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
                        /* FALLTHROUGH */
                case 1:
                        pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
                        pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
                        pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
                        break;
                default:
                        error = (EINVAL);
                        break;
                };
        } else
                error = (ENXIO);

        return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

        dprintf("%s: ctx id %d\n", __func__, ctx_id);

        KASSERT(pt_ctx->buf.topa_hw == NULL,
            ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

        memset(pt_ctx, 0, sizeof(struct pt_ctx));
        mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
        pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
            M_PT, M_NOWAIT | M_ZERO);
        if (pt_ctx->save_area == NULL)
                return (ENOMEM);
        dprintf("%s: preparing ToPA buffer\n", __func__);
        if (pt_topa_prepare(pt_ctx, vm) != 0) {
                free(pt_ctx->save_area, M_PT);
                return (ENOMEM);
        }

        pt_ctx->id = ctx_id;

        return (0);
}

/*
 * Releases a tracing context's ToPA table and XSAVE save area and
 * zeroes the structure for reuse.
 *
 * NOTE(review): 'buf.lock' is mtx_init()ed in pt_init_ctx() but is not
 * mtx_destroy()ed before the structure is zeroed here — verify this on
 * WITNESS/INVARIANTS kernels.
 */
static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

        if (pt_ctx->buf.topa_hw != NULL)
                free(pt_ctx->buf.topa_hw, M_PT);
        if (pt_ctx->save_area != NULL)
                free(pt_ctx->save_area, M_PT);
        memset(pt_ctx, 0, sizeof(*pt_ctx));
}

/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 *
 * Returns 0 on success, ENXIO when the CPU lacks a requested feature,
 * or ENOENT when no matching CPU/thread context exists.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
        struct hwt_cpu *hwt_cpu;
        struct hwt_thread *thr;
        struct pt_ctx *pt_ctx;
        struct pt_cpu_config *cfg;
        struct pt_ext_area *pt_ext;
        struct xstate_hdr *hdr;
        int error;

        dprintf("%s\n", __func__);

        cfg = (struct pt_cpu_config *)ctx->config;
        pt_ctx = NULL;

        /* Clear any flags we don't support yet. */
        cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
        /* Reject requested features the CPU cannot honor. */
        if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
                if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
                        printf("%s: CPU does not support generating MTC "
                            "packets\n", __func__);
                        return (ENXIO);
                }
        }

        if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
                if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
                        printf("%s: CPU does not support CR3 filtering\n",
                            __func__);
                        return (ENXIO);
                }
        }

        if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
                if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
                        printf("%s: CPU does not support TNT\n", __func__);
                        return (ENXIO);
                }
        }
        /* TODO: support for more config bits. */

        /* Locate the tracing context for the target CPU or thread. */
        if (ctx->mode == HWT_MODE_CPU) {
                TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
                        if (hwt_cpu->cpu_id != cpu_id)
                                continue;
                        pt_ctx = &pt_pcpu_ctx[cpu_id];
                        break;
                }
        } else {
                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        if (thr->thread_id != thread_id)
                                continue;
                        KASSERT(thr->private != NULL,
                            ("%s: hwt thread private"
                             " not set, thr %p",
                                __func__, thr));
                        pt_ctx = (struct pt_ctx *)thr->private;
                        break;
                }
        }
        if (pt_ctx == NULL)
                return (ENOENT);

        /* Stage the RTIT register values in the context's XSAVE area. */
        dprintf("%s: preparing MSRs\n", __func__);
        pt_ext = pt_ctx_get_ext_area(pt_ctx);
        hdr = pt_ctx_get_xstate_hdr(pt_ctx);

        pt_ext->rtit_ctl |= cfg->rtit_ctl;
        if (cfg->nranges != 0) {
                dprintf("%s: preparing IPF ranges\n", __func__);
                if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
                        return (error);
        }
        pt_ctx->hwt_ctx = ctx;
        /* Direct output to the prepared ToPA table. */
        pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
        pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
        pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
        /* Mark the PT component present in the compacted XSAVE image. */
        hdr->xstate_bv = XFEATURE_ENABLED_PT;
        hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT |
            XSTATE_XCOMP_BV_COMPACT;
        /* Tracing starts as soon as this state is restored via XRSTORS. */
        pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
        pt_pcpu[cpu_id].ctx = pt_ctx;

        return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 * CPU-mode sessions are started via pt_backend_enable_smp() instead.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{
        if (ctx->mode == HWT_MODE_CPU)
                return;

        KASSERT(curcpu == cpu_id,
            ("%s: attempting to start PT on another cpu", __func__));
        pt_cpu_start(NULL);
        CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 * CPU-mode sessions are stopped via pt_backend_disable_smp() instead.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
        struct pt_cpu *cpu;

        if (ctx->mode == HWT_MODE_CPU)
                return;
        KASSERT(curcpu == cpu_id,
            ("%s: attempting to disable PT on another cpu", __func__));

        cpu = &pt_pcpu[cpu_id];

        dprintf("%s: waiting for cpu %d to exit interrupt handler\n", __func__,
            cpu_id);
        /*
         * Mark the CPU inactive so the PMI handler stops re-enabling
         * tracing, then spin until its in-progress flag stays clear.
         */
        pt_cpu_set_state(cpu_id, PT_INACTIVE);
        while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
                ;

        pt_cpu_stop(NULL);
        CPU_CLR(cpu_id, &ctx->cpu_map);
        cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 * Returns 0 on success or -1 if a CPU-mode session is already active.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{
        dprintf("%s\n", __func__);

        KASSERT(ctx->mode == HWT_MODE_CPU,
            ("%s: should only be used for CPU mode", __func__));
        /* Only one CPU-mode session may run at a time. */
        if (ctx->mode == HWT_MODE_CPU &&
            atomic_swap_32(&cpu_mode_ctr, 1) != 0)
                return (-1);

        /* Start tracing on every CPU in the session's map. */
        smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

        return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 * Returns 0 on success or -1 when no CPU-mode session is active or
 * the session's CPU map is empty.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{
        struct pt_cpu *cpu;
        int cpu_id; /* Was missing: CPU_FOREACH_ISSET needs a declared index. */

        dprintf("%s\n", __func__);
        if (ctx->mode == HWT_MODE_CPU &&
            atomic_swap_32(&cpu_mode_ctr, 0) == 0)
                return (-1);

        if (CPU_EMPTY(&ctx->cpu_map)) {
                dprintf("%s: empty cpu map\n", __func__);
                return (-1);
        }
        /*
         * Mark each CPU inactive and wait until it is out of the ToPA
         * PMI handler before stopping tracing via rendezvous.
         */
        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                cpu = &pt_pcpu[cpu_id];
                dprintf("%s: waiting for cpu %d to exit interrupt handler\n",
                    __func__, cpu_id);
                pt_cpu_set_state(cpu_id, PT_INACTIVE);
                while (atomic_cmpset_int(&cpu->in_pcint_handler, 1, 0))
                        ;
        }
        smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

        return (0);
}

/*
 * HWT backend initialization method.
 *
 * For CPU-mode sessions, sets up a tracing context for every CPU in
 * the session. Thread-mode contexts are instead allocated on demand
 * by pt_backend_alloc_thread().
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
        struct hwt_cpu *hwt_cpu;
        int error;

        dprintf("%s\n", __func__);

        if (ctx->mode != HWT_MODE_CPU)
                return (0);

        TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
                error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id], hwt_cpu->vm,
                    hwt_cpu->cpu_id);
                if (error != 0)
                        return (error);
        }

        return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated tracing contexts. (The NMI handler itself is removed in
 * pt_deinit(), not here.)
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
        struct pt_ctx *pt_ctx;
        struct hwt_thread *thr;
        int cpu_id;

        dprintf("%s\n", __func__);

        pt_backend_disable_smp(ctx);
        if (ctx->mode == HWT_MODE_THREAD) {
                /* Per-thread contexts are owned via thr->private. */
                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        KASSERT(thr->private != NULL,
                            ("%s: thr->private not set", __func__));
                        pt_ctx = (struct pt_ctx *)thr->private;
                        pt_deinit_ctx(pt_ctx);
                }
        } else {
                /* CPU-mode contexts live in the static pt_pcpu_ctx array. */
                CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                        if (pt_pcpu[cpu_id].ctx == NULL)
                                continue;
                        KASSERT(pt_pcpu[cpu_id].ctx == &pt_pcpu_ctx[cpu_id],
                            ("%s: CPU mode tracing with non-cpu mode PT"
                             "context active",
                                __func__));
                        pt_deinit_ctx(pt_pcpu[cpu_id].ctx);
                        pt_pcpu[cpu_id].ctx = NULL;
                        atomic_set_int(&pt_pcpu[cpu_id].in_pcint_handler, 0);
                }
        }

        return (0);
}

/*
 * Fetches current offset into the tracing buffer, reported as a page
 * number and an offset within that page. 'data' is unused.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
        struct pt_ctx *pt_ctx;
        uint64_t off;

        /* Locate the context that owns this trace buffer. */
        if (vm->ctx->mode == HWT_MODE_THREAD)
                pt_ctx = (struct pt_ctx *)vm->thr->private;
        else
                pt_ctx = pt_pcpu[vm->cpu->cpu_id].ctx;

        off = atomic_load_acq_64(&pt_ctx->buf.offset);
        *curpage = off / PAGE_SIZE;
        *curpage_offset = off % PAGE_SIZE;

        return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
        struct pt_ctx *pt_ctx;
        int error;

        /* Omit M_WAITOK since this might get invoked a non-sleepable context */
        pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
        if (pt_ctx == NULL)
                return (ENOMEM);

        error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
        if (error)
                return (error);

        thr->private = pt_ctx;
        return (0);
}
/*
 * HWT thread teardown hook.
 * Releases the thread's tracing context allocated by
 * pt_backend_alloc_thread().
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
        struct pt_ctx *pt_ctx;

        pt_ctx = (struct pt_ctx *)thr->private;

        pt_deinit_ctx(pt_ctx);
        free(pt_ctx, M_PT);
}

/* hwt backend dump method; currently a no-op. */
static void
pt_backend_dump(int cpu_id)
{
}

/* HWT backend operation vector for the Intel PT backend. */
static struct hwt_backend_ops pt_ops = {
        .hwt_backend_init = pt_backend_init,
        .hwt_backend_deinit = pt_backend_deinit,

        .hwt_backend_configure = pt_backend_configure,

        .hwt_backend_enable = pt_backend_enable,
        .hwt_backend_disable = pt_backend_disable,

#ifdef SMP
        .hwt_backend_enable_smp = pt_backend_enable_smp,
        .hwt_backend_disable_smp = pt_backend_disable_smp,
#endif

        .hwt_backend_read = pt_backend_read,
        .hwt_backend_dump = pt_backend_dump,

        .hwt_backend_thread_alloc = pt_backend_alloc_thread,
        .hwt_backend_thread_free = pt_backend_free_thread,
};

/* Backend descriptor registered with hwt(4). */
static struct hwt_backend backend = {
        .ops = &pt_ops,
        .name = "pt",
        .kva_req = 1, /* Trace buffers must be mapped into KVA. */
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg)
{
        struct pt_cpu *cpu = (struct pt_cpu *)arg;
        struct hwt_record_entry record;

        struct pt_ctx *ctx = cpu->ctx;
        pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
        hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}
/*
 * Acknowledges the ToPA PMI by clearing its status bit.
 *
 * Writing a set bit to the GLOBAL_STATUS_RESET MSR clears the
 * corresponding bit in the global status register. The previous
 * "reg &= ~FLAG; reg |= FLAG" pair was redundant — clearing a bit
 * immediately before setting it is a no-op — so only the set remains.
 */
static void
pt_topa_status_clear(void)
{
        uint64_t reg;

        reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
        reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
        wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}

/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Runs from NMI context: record delivery is deferred to a software
 * interrupt. Re-enables the PC interrupt line as long as tracing
 * is active. Returns 1 if the NMI was handled here, 0 otherwise.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
        struct pt_buffer *buf;
        struct pt_cpu *cpu;
        struct pt_ctx *ctx;
        uint64_t reg;

        cpu = &pt_pcpu[curcpu];
        /* Not our interrupt unless the ToPA PMI status bit is set. */
        reg = rdmsr(MSR_IA_GLOBAL_STATUS);
        if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
                pt_topa_status_clear();
                return (0);
        }

        /* Ignore PMIs racing with a trace stop. */
        if (pt_cpu_get_state(curcpu) != PT_ACTIVE) {
                return (1);
        }
        /* Flag handler entry so the stop path can synchronize with us. */
        atomic_set_int(&cpu->in_pcint_handler, 1);

        ctx = cpu->ctx;
        KASSERT(ctx != NULL,
            ("%s: cpu %d: ToPA PMI interrupt without an active context",
                __func__, curcpu));
        buf = &ctx->buf;
        KASSERT(buf->topa_hw != NULL,
            ("%s: cpu %d: ToPA PMI interrupt with invalid buffer", __func__,
                curcpu));
        /* Pause tracing while the buffer offset is harvested. */
        pt_cpu_toggle_local(ctx->save_area, false);
        pt_update_buffer(buf);
        pt_topa_status_clear();

        /* Resume only if no stop request arrived in the meantime. */
        if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
                /* Deliver the buffer record outside NMI context. */
                swi_sched(cpu->swi_cookie, SWI_FROMNMI);
                pt_cpu_toggle_local(ctx->save_area, true);
                lapic_reenable_pcint();
        }
        atomic_set_int(&cpu->in_pcint_handler, 0);
        return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
        u_int cp[4];
        int error, i;

        dprintf("pt: Enumerating part 1\n");
        cpuid_count(CPUID_PT_LEAF, 0, cp);
        dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
        dprintf("pt: ebx %x\n", cp[1]);
        dprintf("pt: ecx %x\n", cp[2]);

        pt_info.l0_eax = cp[0];
        pt_info.l0_ebx = cp[1];
        pt_info.l0_ecx = cp[2];

        dprintf("pt: Enumerating part 2\n");
        cpuid_count(CPUID_PT_LEAF, 1, cp);
        dprintf("pt: eax %x\n", cp[0]);
        dprintf("pt: ebx %x\n", cp[1]);

        pt_info.l1_eax = cp[0];
        pt_info.l1_ebx = cp[1];

        error = hwt_backend_register(&backend);
        if (error != 0) {
                printf("pt: unable to register hwt backend, error %d\n", error);
                return (error);
        }
        pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
            M_ZERO | M_WAITOK);
        pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
            M_ZERO | M_WAITOK);

        for (i = 0; i < mp_ncpus; i++) {
                error = swi_add(&clk_intr_event, "pt", pt_send_buffer_record,
                    &pt_pcpu[i], SWI_CLOCK, INTR_MPSAFE,
                    &pt_pcpu[i].swi_cookie);
                if (error != 0) {
                        dprintf(
                            "%s: failed to add interrupt handler for cpu: %d\n",
                            __func__, error);
                        goto err;
                }
        }

        nmi_register_handler(pt_topa_intr);
        if (lapic_enable_pcint()) {
                initialized = true;
                return (0);
        } else
                printf("pt: failed to setup interrupt line\n");
err:
        nmi_remove_handler(pt_topa_intr);
        hwt_backend_unregister(&backend);

        for (i = 0; i < mp_ncpus; i++) {
                if (pt_pcpu[i].swi_cookie != 0)
                        swi_remove(pt_pcpu[i].swi_cookie);
        }
        free(pt_pcpu, M_PT);
        free(pt_pcpu_ctx, M_PT);
        pt_pcpu = NULL;
        pt_pcpu_ctx = NULL;

        return (error);
}

/*
 * Checks whether the CPU support Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
        u_int regs[4];

        /* Intel PT itself must be present. */
        if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
                printf("pt: CPU does not support Intel Processor Trace\n");
                return (false);
        }
        /* The XSAVE instruction family must be available. */
        if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
                printf("pt: XSAVE is not supported\n");
                return (false);
        }
        /* PT state must be manageable through XSAVE. */
        if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
                printf("pt: CPU does not support managing PT state using XSAVE\n");
                return (false);
        }
        /* Compacted format and supervisor-state save/restore are required. */
        if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
                printf("pt: XSAVE compaction is not supported\n");
                return (false);
        }
        if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
                printf("pt: CPU does not support XSAVES/XRSTORS\n");
                return (false);
        }

        /* Require ToPA support. */
        cpuid_count(CPUID_PT_LEAF, 0, regs);
        if ((regs[2] & CPUPT_TOPA) == 0) {
                printf("pt: ToPA is not supported\n");
                return (false);
        }
        if ((regs[2] & CPUPT_TOPA_MULTI) == 0) {
                printf("pt: multiple ToPA outputs are not supported\n");
                return (false);
        }

        /* Cache the compacted XSAVE layout for the PT save areas. */
        pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
        pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
        pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
            XFEATURE_ENABLED_PT, true, true);

        return (true);
}

static void
pt_deinit(void)
{
        int i;
        struct pt_cpu *cpu;

        if (!initialized)
                return;
        nmi_remove_handler(pt_topa_intr);
        lapic_disable_pcint();
        hwt_backend_unregister(&backend);

        for (i = 0; i < mp_ncpus; i++) {
                cpu = &pt_pcpu[i];
                swi_remove(cpu->swi_cookie);
        }

        free(pt_pcpu, M_PT);
        free(pt_pcpu_ctx, M_PT);
        pt_pcpu = NULL;
        pt_pcpu_ctx = NULL;
        initialized = false;
}

/*
 * Module event handler: probes for PT support and brings the backend
 * up on load, tears it down on unload.
 */
static int
pt_modevent(module_t mod, int type, void *data)
{
        int error = 0;

        switch (type) {
        case MOD_LOAD:
                if (!pt_supported() || pt_init() != 0)
                        error = ENXIO;
                break;
        case MOD_UNLOAD:
                pt_deinit();
                break;
        default:
                break;
        }

        return (error);
}

/* Module glue: loaded after drivers; depends on the hwt(4) framework. */
static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);