root/sys/dev/dt/dt_dev.c
/*      $OpenBSD: dt_dev.c,v 1.48 2026/03/24 09:11:56 cludwig Exp $ */

/*
 * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/clockintr.h>
#include <sys/device.h>
#include <sys/exec_elf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/vnode.h>
#include <uvm/uvm.h>
#include <uvm/uvm_map.h>
#include <uvm/uvm_vnode.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>

#include <machine/intr.h>

#include <dev/dt/dtvar.h>

/*
 * Number of frames to skip in stack traces.
 *
 * The number of frames required to execute dt(4) profiling code
 * depends on the probe, context, architecture and possibly the
 * compiler.
 *
 * Static probes (tracepoints) are executed in the context of the
 * current thread and only need to skip frames up to the recording
 * function.  For example the syscall provider:
 *
 *      dt_prov_syscall_entry+0x141
 *      syscall+0x205           <--- start here
 *      Xsyscall+0x128
 *
 * Probes executed in their own context, like the profile provider,
 * need to skip the frames of that context which are different for
 * every architecture.  For example the profile provider executed
 * from hardclock(9) on amd64:
 *
 *      dt_prov_profile_enter+0x6e
 *      hardclock+0x1a9
 *      lapic_clockintr+0x3f
 *      Xresume_lapic_ltimer+0x26
 *      acpicpu_idle+0x1d2      <---- start here.
 *      sched_idle+0x225
 *      proc_trampoline+0x1c
 */
#if defined(__amd64__)
#define DT_FA_PROFILE   5
#define DT_FA_STATIC    2
#elif defined(__i386__)
#define DT_FA_PROFILE   5
#define DT_FA_STATIC    2
#elif defined(__macppc__)
#define DT_FA_PROFILE  5
#define DT_FA_STATIC   2
#elif defined(__octeon__)
#define DT_FA_PROFILE   6
#define DT_FA_STATIC    2
#elif defined(__powerpc64__)
#define DT_FA_PROFILE   6
#define DT_FA_STATIC    2
#elif defined(__sparc64__)
#define DT_FA_PROFILE   7
#define DT_FA_STATIC    1
#else
#define DT_FA_STATIC    0
#define DT_FA_PROFILE   0
#endif

#define DT_EVTRING_SIZE 16      /* # of slots in per PCB event ring */

#define DPRINTF(x...) /* nothing */

/*
 *  Locks used to protect struct members and variables in this file:
 *      a       atomic
 *      I       invariant after initialization
 *      K       kernel lock
 *      D       dtrace rw-lock dt_lock
 *      r       owned by thread doing read(2)
 *      c       owned by CPU
 *      s       sliced ownership, based on read/write indexes
 *      p       written by CPU, read by thread doing read(2)
 */

/*
 * Per-CPU Event States
 */
struct dt_cpubuf {
        unsigned int             dc_prod;       /* [r] read index */
        unsigned int             dc_cons;       /* [c] write index */
        struct dt_evt           *dc_ring;       /* [s] ring of event states */
        unsigned int             dc_inevt;      /* [c] in event already? */

        /* Counters */
        unsigned int             dc_dropevt;    /* [p] # of events dropped */
        unsigned int             dc_skiptick;   /* [p] # of ticks skipped */
        unsigned int             dc_recurevt;   /* [p] # of recursive events */
        unsigned int             dc_readevt;    /* [r] # of events read */
};

/*
 * Descriptor associated with each program opening /dev/dt.  It is used
 * to keep track of enabled PCBs.
 */
struct dt_softc {
        SLIST_ENTRY(dt_softc)    ds_next;       /* [K] descriptor list */
        int                      ds_unit;       /* [I] D_CLONE unique unit */
        pid_t                    ds_pid;        /* [I] PID of tracing program */
        void                    *ds_si;         /* [I] to defer wakeup(9) */

        struct dt_pcb_list       ds_pcbs;       /* [K] list of enabled PCBs */
        int                      ds_recording;  /* [D] currently recording? */
        unsigned int             ds_evtcnt;     /* [a] # of readable evts */

        struct dt_cpubuf         ds_cpu[MAXCPUS]; /* [I] Per-cpu event states */
        unsigned int             ds_lastcpu;    /* [r] last CPU ring read(2). */
};

SLIST_HEAD(, dt_softc) dtdev_list;      /* [K] list of open /dev/dt nodes */

/*
 * Probes are created during dt_attach() and never modified/freed during
 * the lifetime of the system.  That's why we consider them as [I]mmutable.
 */
unsigned int                    dt_nprobes;     /* [I] # of probes available */
SIMPLEQ_HEAD(, dt_probe)        dt_probe_list;  /* [I] list of probes */

struct rwlock                   dt_lock = RWLOCK_INITIALIZER("dtlk");
volatile uint32_t               dt_tracing = 0; /* [D] # of processes tracing */

int allowdt;                                    /* [a] */

void    dtattach(struct device *, struct device *, void *);
int     dtopen(dev_t, int, int, struct proc *);
int     dtclose(dev_t, int, int, struct proc *);
int     dtread(dev_t, struct uio *, int);
int     dtioctl(dev_t, u_long, caddr_t, int, struct proc *);

struct  dt_softc *dtlookup(int);
struct  dt_softc *dtalloc(void);
void    dtfree(struct dt_softc *);

int     dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *);
int     dt_ioctl_get_args(struct dt_softc *, struct dtioc_arg *);
int     dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *);
int     dt_ioctl_record_start(struct dt_softc *);
void    dt_ioctl_record_stop(struct dt_softc *);
int     dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *);
int     dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *);
int     dt_ioctl_rd_vnode(struct dt_softc *, struct dtioc_rdvn *);

int     dt_ring_copy(struct dt_cpubuf *, struct uio *, size_t, size_t *);

void    dt_wakeup(struct dt_softc *);
void    dt_deferred_wakeup(void *);

void
dtattach(struct device *parent, struct device *self, void *aux)
{
        SLIST_INIT(&dtdev_list);
        SIMPLEQ_INIT(&dt_probe_list);

        /* Init providers */
        dt_nprobes += dt_prov_profile_init();
        dt_nprobes += dt_prov_syscall_init();
        dt_nprobes += dt_prov_static_init();
#ifdef DDBPROF
        dt_nprobes += dt_prov_kprobe_init();
#endif
}

int
dtopen(dev_t dev, int flags, int mode, struct proc *p)
{
        struct dt_softc *sc;
        int unit = minor(dev);

        if (atomic_load_int(&allowdt) == 0)
                return EPERM;

        sc = dtalloc();
        if (sc == NULL)
                return ENOMEM;

        /* no sleep after this point */
        if (dtlookup(unit) != NULL) {
                dtfree(sc);
                return EBUSY;
        }

        sc->ds_unit = unit;
        sc->ds_pid = p->p_p->ps_pid;
        TAILQ_INIT(&sc->ds_pcbs);
        sc->ds_lastcpu = 0;
        sc->ds_evtcnt = 0;

        SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next);

        DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid);

        return 0;
}

int
dtclose(dev_t dev, int flags, int mode, struct proc *p)
{
        struct dt_softc *sc;
        int unit = minor(dev);

        sc = dtlookup(unit);
        KASSERT(sc != NULL);

        DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid);

        SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next);
        dt_ioctl_record_stop(sc);
        dt_pcb_purge(&sc->ds_pcbs);
        dtfree(sc);

        return 0;
}

int
dtread(dev_t dev, struct uio *uio, int flags)
{
        struct dt_softc *sc;
        struct dt_cpubuf *dc;
        int i, error = 0, unit = minor(dev);
        size_t count, max, read = 0;

        sc = dtlookup(unit);
        KASSERT(sc != NULL);

        max = howmany(uio->uio_resid, sizeof(struct dt_evt));
        if (max < 1)
                return (EMSGSIZE);

        while (!atomic_load_int(&sc->ds_evtcnt)) {
                sleep_setup(sc, PWAIT | PCATCH, "dtread");
                error = sleep_finish(INFSLP, !atomic_load_int(&sc->ds_evtcnt));
                if (error == EINTR || error == ERESTART)
                        break;
        }
        if (error)
                return error;

        KERNEL_ASSERT_LOCKED();
        for (i = 0; i < ncpusfound; i++) {
                count = 0;
                dc = &sc->ds_cpu[(sc->ds_lastcpu + i) % ncpusfound];
                error = dt_ring_copy(dc, uio, max, &count);
                if (error && count == 0)
                        break;

                read += count;
                max -= count;
                if (max == 0)
                        break;
        }
        sc->ds_lastcpu += i % ncpusfound;

        atomic_sub_int(&sc->ds_evtcnt, read);

        return error;
}

int
dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
        struct dt_softc *sc;
        int unit = minor(dev);
        int on, error = 0;

        sc = dtlookup(unit);
        KASSERT(sc != NULL);

        switch (cmd) {
        case DTIOCGPLIST:
                return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr);
        case DTIOCGARGS:
                return dt_ioctl_get_args(sc, (struct dtioc_arg *)addr);
        case DTIOCGSTATS:
                return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr);
        case DTIOCRECORD:
        case DTIOCPRBENABLE:
        case DTIOCPRBDISABLE:
        case DTIOCRDVNODE:
                /* root only ioctl(2) */
                break;
        default:
                return ENOTTY;
        }

        if ((error = suser(p)) != 0)
                return error;

        switch (cmd) {
        case DTIOCRECORD:
                on = *(int *)addr;
                if (on)
                        error = dt_ioctl_record_start(sc);
                else
                        dt_ioctl_record_stop(sc);
                break;
        case DTIOCPRBENABLE:
                error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr);
                break;
        case DTIOCPRBDISABLE:
                error = dt_ioctl_probe_disable(sc, (struct dtioc_req *)addr);
                break;
        case DTIOCRDVNODE:
                error = dt_ioctl_rd_vnode(sc, (struct dtioc_rdvn *)addr);
                break;
        default:
                KASSERT(0);
        }

        return error;
}

struct dt_softc *
dtlookup(int unit)
{
        struct dt_softc *sc;

        KERNEL_ASSERT_LOCKED();

        SLIST_FOREACH(sc, &dtdev_list, ds_next) {
                if (sc->ds_unit == unit)
                        break;
        }

        return sc;
}

struct dt_softc *
dtalloc(void)
{
        struct dt_softc *sc;
        struct dt_evt *dtev;
        int i;

        sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
        if (sc == NULL)
                return NULL;

        for (i = 0; i < ncpusfound; i++) {
                dtev = mallocarray(DT_EVTRING_SIZE, sizeof(*dtev), M_DEVBUF,
                    M_WAITOK|M_CANFAIL|M_ZERO);
                if (dtev == NULL)
                        break;
                sc->ds_cpu[i].dc_ring = dtev;
        }
        if (i < ncpusfound) {
                dtfree(sc);
                return NULL;
        }

        sc->ds_si = softintr_establish(IPL_SOFTCLOCK | IPL_MPSAFE,
            dt_deferred_wakeup, sc);
        if (sc->ds_si == NULL) {
                dtfree(sc);
                return NULL;
        }

        return sc;
}

void
dtfree(struct dt_softc *sc)
{
        struct dt_evt *dtev;
        int i;

        if (sc->ds_si != NULL)
                softintr_disestablish(sc->ds_si);

        for (i = 0; i < ncpusfound; i++) {
                dtev = sc->ds_cpu[i].dc_ring;
                free(dtev, M_DEVBUF, DT_EVTRING_SIZE * sizeof(*dtev));
        }
        free(sc, M_DEVBUF, sizeof(*sc));
}

int
dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr)
{
        struct dtioc_probe_info info, *dtpi;
        struct dt_probe *dtp;
        size_t size;
        int error = 0;

        size = dtpr->dtpr_size;
        dtpr->dtpr_size = dt_nprobes * sizeof(*dtpi);
        if (size == 0)
                return 0;

        dtpi = dtpr->dtpr_probes;
        SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
                if (size < sizeof(*dtpi)) {
                        error = ENOSPC;
                        break;
                }
                memset(&info, 0, sizeof(info));
                info.dtpi_pbn = dtp->dtp_pbn;
                info.dtpi_nargs = dtp->dtp_nargs;
                strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name,
                    sizeof(info.dtpi_prov));
                strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func));
                strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name));
                error = copyout(&info, dtpi, sizeof(*dtpi));
                if (error)
                        break;
                size -= sizeof(*dtpi);
                dtpi++;
        }

        return error;
}

int
dt_ioctl_get_args(struct dt_softc *sc, struct dtioc_arg *dtar)
{
        struct dtioc_arg_info info, *dtai;
        struct dt_probe *dtp;
        size_t size, n, t;
        uint32_t pbn;
        int error = 0;

        pbn = dtar->dtar_pbn;
        if (pbn == 0 || pbn > dt_nprobes)
                return EINVAL;

        SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
                if (pbn == dtp->dtp_pbn)
                        break;
        }
        if (dtp == NULL)
                return EINVAL;

        if (dtp->dtp_sysnum != 0) {
                /* currently not supported for system calls */
                dtar->dtar_size = 0;
                return 0;
        }

        size = dtar->dtar_size;
        dtar->dtar_size = dtp->dtp_nargs * sizeof(*dtar);
        if (size == 0)
                return 0;

        t = 0;
        dtai = dtar->dtar_args;
        for (n = 0; n < dtp->dtp_nargs; n++) {
                if (size < sizeof(*dtai)) {
                        error = ENOSPC;
                        break;
                }
                if (n >= DTMAXARGTYPES || dtp->dtp_argtype[n] == NULL)
                        continue;
                memset(&info, 0, sizeof(info));
                info.dtai_pbn = dtp->dtp_pbn;
                info.dtai_argn = t++;
                strlcpy(info.dtai_argtype, dtp->dtp_argtype[n],
                    sizeof(info.dtai_argtype));
                error = copyout(&info, dtai, sizeof(*dtai));
                if (error)
                        break;
                size -= sizeof(*dtai);
                dtai++;
        }
        dtar->dtar_size = t * sizeof(*dtar);

        return error;
}

int
dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst)
{
        struct dt_cpubuf *dc;
        uint64_t readevt, dropevt, skiptick, recurevt;
        int i;

        readevt = dropevt = skiptick = 0;
        for (i = 0; i < ncpusfound; i++) {
                dc = &sc->ds_cpu[i];

                membar_consumer();
                dropevt += dc->dc_dropevt;
                skiptick = dc->dc_skiptick;
                recurevt = dc->dc_recurevt;
                readevt += dc->dc_readevt;
        }

        dtst->dtst_readevt = readevt;
        dtst->dtst_dropevt = dropevt;
        dtst->dtst_skiptick = skiptick;
        dtst->dtst_recurevt = recurevt;
        return 0;
}

int
dt_ioctl_record_start(struct dt_softc *sc)
{
        uint64_t now;
        struct dt_pcb *dp;
        int error = 0;

        rw_enter_write(&dt_lock);
        if (sc->ds_recording) {
                error = EBUSY;
                goto out;
        }

        KERNEL_ASSERT_LOCKED();
        if (TAILQ_EMPTY(&sc->ds_pcbs)) {
                error = ENOENT;
                goto out;
        }

        now = nsecuptime();
        TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
                struct dt_probe *dtp = dp->dp_dtp;

                SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
                dtp->dtp_recording++;
                dtp->dtp_prov->dtpv_recording++;

                if (dp->dp_nsecs != 0) {
                        clockintr_bind(&dp->dp_clockintr, dp->dp_cpu, dt_clock,
                            dp);
                        clockintr_schedule(&dp->dp_clockintr,
                            now + dp->dp_nsecs);
                }
        }
        sc->ds_recording = 1;
        dt_tracing++;

 out:
        rw_exit_write(&dt_lock);
        return error;
}

void
dt_ioctl_record_stop(struct dt_softc *sc)
{
        struct dt_pcb *dp;

        rw_enter_write(&dt_lock);
        if (!sc->ds_recording) {
                rw_exit_write(&dt_lock);
                return;
        }

        DPRINTF("dt%d: pid %d disable\n", sc->ds_unit, sc->ds_pid);

        dt_tracing--;
        sc->ds_recording = 0;
        TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
                struct dt_probe *dtp = dp->dp_dtp;

                /*
                 * Set an execution barrier to ensure the shared
                 * reference to dp is inactive.
                 */
                if (dp->dp_nsecs != 0)
                        clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);

                dtp->dtp_recording--;
                dtp->dtp_prov->dtpv_recording--;
                SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
        }
        rw_exit_write(&dt_lock);

        /* Wait until readers cannot access the PCBs. */
        smr_barrier();
}

int
dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq)
{
        struct dt_pcb_list plist;
        struct dt_probe *dtp;
        struct dt_pcb *dp;
        int error;

        if (sc->ds_recording)
                return EBUSY;

        SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
                if (dtp->dtp_pbn == dtrq->dtrq_pbn)
                        break;
        }
        if (dtp == NULL)
                return ENOENT;

        /* Only allow one probe of each type. */
        TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
                if (dp->dp_dtp->dtp_pbn == dtrq->dtrq_pbn)
                        return EEXIST;
        }

        TAILQ_INIT(&plist);
        error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq);
        if (error)
                return error;

        DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid,
            dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS);

        /* Append all PCBs to this instance */
        TAILQ_CONCAT(&sc->ds_pcbs, &plist, dp_snext);

        return 0;
}

int
dt_ioctl_probe_disable(struct dt_softc *sc, struct dtioc_req *dtrq)
{
        struct dt_probe *dtp;
        int error;

        if (sc->ds_recording)
                return EBUSY;

        SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
                if (dtp->dtp_pbn == dtrq->dtrq_pbn)
                        break;
        }
        if (dtp == NULL)
                return ENOENT;

        if (dtp->dtp_prov->dtpv_dealloc) {
                error = dtp->dtp_prov->dtpv_dealloc(dtp, sc, dtrq);
                if (error)
                        return error;
        }

        DPRINTF("dt%d: pid %d dealloc\n", sc->ds_unit, sc->ds_pid,
            dtrq->dtrq_pbn);

        return 0;
}

int
dt_ioctl_rd_vnode(struct dt_softc *sc, struct dtioc_rdvn *dtrv)
{
        struct process *ps;
        struct proc *p = curproc;
        boolean_t ok;
        struct vm_map_entry *e;
        int err = 0;
        int fd;
        struct uvm_vnode *uvn;
        struct vnode *vn;
        struct file *fp;

        if ((ps = prfind(dtrv->dtrv_pid)) == NULL)
                return ESRCH;

        vm_map_lock_read(&ps->ps_vmspace->vm_map);

        ok = uvm_map_lookup_entry(&ps->ps_vmspace->vm_map,
            (vaddr_t)dtrv->dtrv_va, &e);
        if (ok == 0 || (e->etype & UVM_ET_OBJ) == 0 ||
            (e->protection & PROT_EXEC) == 0 ||
            !UVM_OBJ_IS_VNODE(e->object.uvm_obj)) {
                err = ENOENT;
                vn = NULL;
                DPRINTF("%s no mapping for %p\n", __func__, dtrv->dtrv_va);
        } else {
                uvn = (struct uvm_vnode *)e->object.uvm_obj;
                vn = uvn->u_vnode;
                vref(vn);

                dtrv->dtrv_len = (size_t)uvn->u_size;
                dtrv->dtrv_start = (caddr_t)e->start;
                dtrv->dtrv_offset = (caddr_t)e->offset;
        }

        vm_map_unlock_read(&ps->ps_vmspace->vm_map);

        if (vn != NULL) {
                fdplock(p->p_fd);
                err = falloc(p, &fp, &fd);
                fdpunlock(p->p_fd);
                if (err != 0) {
                        vrele(vn);
                        DPRINTF("%s fdopen failed (%d)\n", __func__, err);
                        return err;
                }
                err = VOP_OPEN(vn, O_RDONLY, p->p_p->ps_ucred, p);
                if (err == 0) {
                        fp->f_flag = FREAD;
                        fp->f_type = DTYPE_VNODE;
                        fp->f_ops = &vnops;
                        fp->f_data = vn;
                        dtrv->dtrv_fd = fd;
                        fdplock(p->p_fd);
                        fdinsert(p->p_fd, fd, UF_EXCLOSE, fp);
                        fdpunlock(p->p_fd);
                        FRELE(fp, p);
                } else {
                        DPRINTF("%s vopen() failed (%d)\n", __func__,
                            err);
                        vrele(vn);
                        fdplock(p->p_fd);
                        fdremove(p->p_fd, fd);
                        fdpunlock(p->p_fd);
                        FRELE(fp, p);
                }
        }

        return err;
}

struct dt_probe *
dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv)
{
        struct dt_probe *dtp;

        dtp = malloc(sizeof(*dtp), M_DT, M_NOWAIT|M_ZERO);
        if (dtp == NULL)
                return NULL;

        SMR_SLIST_INIT(&dtp->dtp_pcbs);
        dtp->dtp_prov = dtpv;
        dtp->dtp_func = func;
        dtp->dtp_name = name;
        dtp->dtp_sysnum = -1;
        dtp->dtp_ref = 0;

        return dtp;
}

void
dt_dev_register_probe(struct dt_probe *dtp)
{
        static uint64_t probe_nb;

        dtp->dtp_pbn = ++probe_nb;
        SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next);
}

struct dt_pcb *
dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc)
{
        struct dt_pcb *dp;

        dp = malloc(sizeof(*dp), M_DT, M_WAITOK|M_CANFAIL|M_ZERO);
        if (dp == NULL)
                return NULL;

        dp->dp_sc = sc;
        dp->dp_dtp = dtp;
        return dp;
}

void
dt_pcb_free(struct dt_pcb *dp)
{
        free(dp, M_DT, sizeof(*dp));
}

void
dt_pcb_purge(struct dt_pcb_list *plist)
{
        struct dt_pcb *dp;

        while ((dp = TAILQ_FIRST(plist)) != NULL) {
                TAILQ_REMOVE(plist, dp, dp_snext);
                dt_pcb_free(dp);
        }
}

void
dt_pcb_ring_skiptick(struct dt_pcb *dp, unsigned int skip)
{
        struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

        dc->dc_skiptick += skip;
        membar_producer();
}

/*
 * Get a reference to the next free event state from the ring.
 */
struct dt_evt *
dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
        struct proc *p = curproc;
        struct dt_evt *dtev;
        int prod, cons, distance;
        struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

        if (dc->dc_inevt == 1) {
                dc->dc_recurevt++;
                membar_producer();
                return NULL;
        }

        dc->dc_inevt = 1;

        membar_consumer();
        prod = dc->dc_prod;
        cons = dc->dc_cons;
        distance = prod - cons;
        if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) {
                /* read(2) isn't finished */
                dc->dc_dropevt++;
                membar_producer();

                dc->dc_inevt = 0;
                return NULL;
        }

        /*
         * Save states in next free event slot.
         */
        dtev = &dc->dc_ring[cons];
        memset(dtev, 0, sizeof(*dtev));

        dtev->dtev_pbn = dp->dp_dtp->dtp_pbn;
        dtev->dtev_cpu = cpu_number();
        dtev->dtev_pid = p->p_p->ps_pid;
        dtev->dtev_tid = p->p_tid + THREAD_PID_OFFSET;
        nanotime(&dtev->dtev_tsp);

        if (ISSET(dp->dp_evtflags, DTEVT_EXECNAME))
                strlcpy(dtev->dtev_comm, p->p_p->ps_comm, sizeof(dtev->dtev_comm));

        if (ISSET(dp->dp_evtflags, DTEVT_KSTACK)) {
                if (profiling)
                        stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
                else
                        stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
        }
        if (ISSET(dp->dp_evtflags, DTEVT_USTACK))
                stacktrace_save_utrace(&dtev->dtev_ustack);

        return dtev;
}

void
dt_pcb_ring_consume(struct dt_pcb *dp, struct dt_evt *dtev)
{
        struct dt_cpubuf *dc = &dp->dp_sc->ds_cpu[cpu_number()];

        KASSERT(dtev == &dc->dc_ring[dc->dc_cons]);

        dc->dc_cons = (dc->dc_cons + 1) % DT_EVTRING_SIZE;
        membar_producer();

        atomic_inc_int(&dp->dp_sc->ds_evtcnt);
        dc->dc_inevt = 0;

        dt_wakeup(dp->dp_sc);
}

/*
 * Copy at most `max' events from `dc', producing the same amount
 * of free slots.
 */
int
dt_ring_copy(struct dt_cpubuf *dc, struct uio *uio, size_t max, size_t *rcvd)
{
        size_t count, copied = 0;
        unsigned int cons, prod;
        int error = 0;

        KASSERT(max > 0);

        membar_consumer();
        cons = dc->dc_cons;
        prod = dc->dc_prod;

        if (cons < prod)
                count = DT_EVTRING_SIZE - prod;
        else
                count = cons - prod;

        if (count == 0)
                return 0;

        count = MIN(count, max);
        error = uiomove(&dc->dc_ring[prod], count * sizeof(struct dt_evt), uio);
        if (error)
                return error;
        copied += count;

        /* Produce */
        prod = (prod + count) % DT_EVTRING_SIZE;

        /* If the ring didn't wrap, stop here. */
        if (max == copied || prod != 0 || cons == 0)
                goto out;

        count = MIN(cons, (max - copied));
        error = uiomove(&dc->dc_ring[0], count * sizeof(struct dt_evt), uio);
        if (error)
                goto out;

        copied += count;
        prod += count;

out:
        dc->dc_readevt += copied;
        dc->dc_prod = prod;
        membar_producer();

        *rcvd = copied;
        return error;
}

void
dt_wakeup(struct dt_softc *sc)
{
        /*
         * It is not always safe or possible to call wakeup(9) and grab
         * the SCHED_LOCK() from a given tracepoint.  This is true for
         * any tracepoint that might trigger inside the scheduler or at
         * any IPL higher than IPL_SCHED.  For this reason use a soft-
         * interrupt to defer the wakeup.
         */
        softintr_schedule(sc->ds_si);
}

void
dt_deferred_wakeup(void *arg)
{
        struct dt_softc *sc = arg;

        wakeup(sc);
}