root/sys/arm64/spe/arm_spe_backend.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Arm Ltd
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * Portions of this software were developed by Andrew Turner under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Arm Statistical Profiling Extension (SPE) backend
 *
 * Basic SPE operation
 *
 *   SPE is enabled and configured on a per-core basis, with the enable and
 *   configure code needing to run on each core. Each core also requires a
 *   separate buffer, passed in via config, where the CPU will write profiling
 *   data. When the profiling buffer is full, an interrupt is taken on the
 *   same CPU.
 *
 * Driver Design
 *
 * - HWT allocates a single large buffer per core. This buffer is split in half
 *   to create a two-element circular buffer (aka ping-pong buffer), where the
 *   kernel writes to one half while userspace copies out the other half
 * - SMP calls are used to enable and configure each core, with SPE initially
 *   configured to write to the first half of the buffer
 * - When the first half of the buffer is full, a buffer-full interrupt
 *   immediately switches writing to the second half. The kernel adds the
 *   details of the half that needs copying to a FIFO STAILQ and notifies
 *   userspace via kqueue by sending an ARM_SPE_KQ_BUF kevent carrying the
 *   number of buffers on the queue that need servicing (see the sketch below)
 * - The kernel responds to the HWT_IOC_BUFPTR_GET ioctl by sending details of
 *   the first item on the queue
 * - Buffers pending copying will not be overwritten until an
 *   HWT_IOC_SVC_BUF ioctl is received from userspace confirming the data has
 *   been copied out
 * - In the case where both halves of the buffer are full, profiling is
 *   paused until notification via HWT_IOC_SVC_BUF is received
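 *
 *   A userspace consumer of this design might look roughly like the sketch
 *   below, where the kevent's data field carries the number of queued buffer
 *   halves (pseudocode only; argument types, error handling and device setup
 *   are elided):
 *
 *       for (;;) {
 *               kevent(kq, NULL, 0, &ev, 1, NULL);
 *               if (ev.ident != ARM_SPE_KQ_BUF)
 *                       continue;
 *               for (n = ev.data; n > 0; n--) {
 *                       ioctl(fd, HWT_IOC_BUFPTR_GET, &ptr);
 *                       ... copy out the half described by ptr ...
 *                       ioctl(fd, HWT_IOC_SVC_BUF, &svc);
 *               }
 *       }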
 *
 * Future improvements and limitations
 *
 * - Using large buffer sizes should minimise pauses and loss of profiling
 *   data while the kernel is waiting for userspace to copy out data. Since
 *   consuming (copying) this data is generally expected to be faster than
 *   producing it, in practice this has not been an issue so far. If it does
 *   prove to be an issue even with large buffer sizes, then additional
 *   buffering, i.e. n-element circular buffers, might be required.
 *
 * - kqueue can only notify and queue one kevent of a given type, with
 *   subsequent events overwriting the data in the first event. The
 *   ARM_SPE_KQ_BUF kevent can therefore carry only the number of buffers on
 *   the STAILQ, incremented each time a new buffer fills. In this case kqueue
 *   serves purely as a notification for userspace to wake up and query the
 *   kernel with the appropriate ioctl. An alternative might be custom kevents
 *   where the kevent identifier is encoded with something like n+cpu_id or
 *   n+tid. In that case, data could be sent directly with kqueue via the
 *   kevent's data and fflags fields, avoiding the extra ioctl.
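 *   For example (hypothetical encoding, not implemented here):
 *
 *       EV_SET(&kev, ARM_SPE_KQ_BUF + cpu_id, EVFILT_USER, 0, NOTE_TRIGGER,
 *           offset, NULL);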
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/hwt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/rwlock.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

#include <machine/bus.h>

#include <arm64/spe/arm_spe_dev.h>

#include <dev/hwt/hwt_vm.h>
#include <dev/hwt/hwt_backend.h>
#include <dev/hwt/hwt_config.h>
#include <dev/hwt/hwt_context.h>
#include <dev/hwt/hwt_cpu.h>
#include <dev/hwt/hwt_thread.h>

MALLOC_DECLARE(M_ARM_SPE);

extern u_int mp_maxid;
extern struct taskqueue *taskqueue_arm_spe;

int spe_backend_disable_smp(struct hwt_context *ctx);

static device_t spe_dev;
static struct hwt_backend_ops spe_ops;
static struct hwt_backend backend = {
        .ops = &spe_ops,
        .name = "spe",
        .kva_req = 1,
};

/*
 * Pointers to the current info structure for each CPU. Each entry points to
 * either a per-CPU structure (CPU mode) or a per-thread structure (thread
 * mode).
 */
static struct arm_spe_info **spe_info;

static struct arm_spe_info *spe_info_cpu;

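/*
 * CPU mode setup: allocate the per-CPU arm_spe_info array and initialise an
 * entry, including its buffer descriptors and spin lock, for each CPU in the
 * context's cpu_map.
 */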
static void
spe_backend_init_cpu(struct hwt_context *ctx)
{
        struct arm_spe_info *info;
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        char lock_name[32];
        char *tmp = "Arm SPE lock/cpu/";
        int cpu_id;

        /* CPU IDs may be sparse, so size the array by the highest CPU ID */
        spe_info_cpu = malloc(sizeof(struct arm_spe_info) * (mp_maxid + 1),
            M_ARM_SPE, M_WAITOK | M_ZERO);

        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                info = &spe_info_cpu[cpu_id];
                info->sc = sc;
                info->ident = cpu_id;
                info->buf_info[0].info = info;
                info->buf_info[0].buf_idx = 0;
                info->buf_info[1].info = info;
                info->buf_info[1].buf_idx = 1;
                snprintf(lock_name, sizeof(lock_name), "%s%d", tmp, cpu_id);
                mtx_init(&info->lock, lock_name, NULL, MTX_SPIN);

                spe_info[cpu_id] = info;
        }
}

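/*
 * Sanity check the buffer size, cache the context details in the softc and
 * allocate the per-CPU info pointer array.
 */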
static int
spe_backend_init(struct hwt_context *ctx)
{
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        int error = 0;

        /*
         * HWT currently requires the buffer size to be a multiple of
         * PAGE_SIZE (i.e. a minimum of 4KB), while the maximum PMBIDR.Align
         * is 2KB, so this check should never fail, but it is a useful sanity
         * check.
         */
        if (ctx->bufsize % sc->kva_align != 0)
                return (EINVAL);

        /*
         * Since the buffer is split in half and PMBLIMITR must be page
         * aligned, the minimum buffer size is 2 * PAGE_SIZE.
         */
        if (ctx->bufsize < (2 * PAGE_SIZE))
                return (EINVAL);

        sc->ctx = ctx;
        sc->kqueue_fd = ctx->kqueue_fd;
        sc->hwt_td = ctx->hwt_td;

        spe_info = malloc(sizeof(struct arm_spe_info *) * (mp_maxid + 1),
            M_ARM_SPE, M_WAITOK | M_ZERO);
        sc->spe_info = spe_info;

        if (ctx->mode == HWT_MODE_CPU)
                spe_backend_init_cpu(ctx);

        return (error);
}

#ifdef ARM_SPE_DEBUG
static void
hex_dump(uint8_t *buf, size_t len)
{
        size_t i;

        printf("--------------------------------------------------------------\n");
        for (i = 0; i < len; ++i) {
                if (i % 8 == 0)
                        printf(" ");
                if (i % 16 == 0) {
                        if (i != 0)
                                printf("\n");
                        printf("\t");
                }
                printf("%02X ", buf[i]);
        }
        printf("\n");
}
#endif

static int
spe_backend_deinit(struct hwt_context *ctx)
{
#ifdef ARM_SPE_DEBUG
        struct arm_spe_info *info;
        struct hwt_thread *thr;
        int cpu_id;

        if (ctx->mode == HWT_MODE_CPU) {
                CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                        info = &spe_info_cpu[cpu_id];
                        printf("CPU %u:\n", cpu_id);
                        hex_dump((void *)info->kvaddr, 128);
                        hex_dump((void *)(info->kvaddr + (info->buf_size/2)), 128);
                }
        } else {
                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        info = (struct arm_spe_info *)thr->private;
                        printf("TID %u:\n", thr->thread_id);
                        hex_dump((void *)info->kvaddr, 128);
                        hex_dump((void *)(info->kvaddr + (info->buf_size/2)), 128);
                }
        }
#endif

        spe_backend_disable_smp(ctx);

        if (ctx->mode == HWT_MODE_CPU)
                free(spe_info_cpu, M_ARM_SPE);

        free(spe_info, M_ARM_SPE);

        return (0);
}

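/* Decode the IMPLEMENTATION DEFINED minimum sampling interval from PMSIDR_EL1. */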
static uint64_t
arm_spe_min_interval(struct arm_spe_softc *sc)
{
        /* IMPLEMENTATION DEFINED */
        switch (PMSIDR_Interval_VAL(sc->pmsidr))
        {
        case PMSIDR_Interval_256:
                return (256);
        case PMSIDR_Interval_512:
                return (512);
        case PMSIDR_Interval_768:
                return (768);
        case PMSIDR_Interval_1024:
                return (1024);
        case PMSIDR_Interval_1536:
                return (1536);
        case PMSIDR_Interval_2048:
                return (2048);
        case PMSIDR_Interval_3072:
                return (3072);
        case PMSIDR_Interval_4096:
                return (4096);
        default:
                return (4096);
        }
}

static inline void
arm_spe_set_interval(struct arm_spe_info *info, uint64_t interval)
{
        uint64_t min_interval = arm_spe_min_interval(info->sc);

        interval = MAX(interval, min_interval);
        interval = MIN(interval, (1UL << 24) - 1);      /* 24-bit field */

        dprintf("%s %lu\n", __func__, interval);

        info->pmsirr &= ~(PMSIRR_INTERVAL_MASK);
        info->pmsirr |= (interval << PMSIRR_INTERVAL_SHIFT);
}

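/*
 * Store the session configuration (defaults, then any user-supplied
 * struct arm_spe_config) into the per-CPU or per-thread info structure.
 */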
static int
spe_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
        struct arm_spe_info *info = NULL;
        struct arm_spe_config *cfg;
        struct hwt_thread *thr = NULL;
        int err = 0;

        if (ctx->mode == HWT_MODE_CPU)
                info = &spe_info_cpu[cpu_id];
        else {
                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        if (thr->thread_id != thread_id)
                                continue;
                        info = (struct arm_spe_info *)thr->private;
                        break;
                }
                if (info == NULL)
                        return (ENOENT);
        }

        mtx_lock_spin(&info->lock);
        if (ctx->mode == HWT_MODE_CPU)
                info->ident = cpu_id;
        else
                info->ident = thread_id;
        /* Set defaults */
        info->pmsfcr = 0;
        info->pmsevfr = 0xFFFFFFFFFFFFFFFFUL;
        info->pmslatfr = 0;
        info->pmsirr =
            (arm_spe_min_interval(info->sc) << PMSIRR_INTERVAL_SHIFT)
            | PMSIRR_RND;
        info->pmsicr = 0;
        info->pmscr = PMSCR_TS | PMSCR_PA | PMSCR_CX | PMSCR_E1SPE | PMSCR_E0SPE;

        if (ctx->config != NULL &&
            ctx->config_size == sizeof(struct arm_spe_config) &&
            ctx->config_version == 1) {
                cfg = (struct arm_spe_config *)ctx->config;
                if (cfg->interval)
                        arm_spe_set_interval(info, cfg->interval);
                if (cfg->level == ARM_SPE_KERNEL_ONLY)
                        info->pmscr &= ~(PMSCR_E0SPE); /* turn off user */
                if (cfg->level == ARM_SPE_USER_ONLY)
                        info->pmscr &= ~(PMSCR_E1SPE); /* turn off kern */
                if (cfg->ctx_field)
                        info->ctx_field = cfg->ctx_field;
        } else
                err = EINVAL;

        if (ctx->mode == HWT_MODE_THREAD) {
                info->kvaddr = thr->vm->kvaddr;
                info->buf_size = ctx->bufsize;
        }

        spe_info[cpu_id] = info;
        mtx_unlock_spin(&info->lock);

        return (err);
}
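/*
 * Enable profiling on the current CPU: program the filter, interval and
 * buffer registers from the cached info and start sampling. Runs locally or
 * as an SMP rendezvous callback.
 */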
static void
arm_spe_enable(void *arg __unused)
{
        struct arm_spe_info *info = spe_info[PCPU_GET(cpuid)];
        struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];
        struct hwt_context *ctx = info->sc->ctx;
        uint64_t base, limit;

        dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

        mtx_lock_spin(&info->lock);

        if (info->stopped) {
                mtx_unlock_spin(&info->lock);
                return;
        }

        if (info->ctx_field == ARM_SPE_CTX_CPU_ID)
                WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, PCPU_GET(cpuid));

        WRITE_SPECIALREG(PMSFCR_EL1_REG, info->pmsfcr);
        WRITE_SPECIALREG(PMSEVFR_EL1_REG, info->pmsevfr);
        WRITE_SPECIALREG(PMSLATFR_EL1_REG, info->pmslatfr);

        /* Set the sampling interval */
        WRITE_SPECIALREG(PMSIRR_EL1_REG, info->pmsirr);
        isb();

        /* Reset the sample interval counter (pmsicr is zeroed at configure) */
        WRITE_SPECIALREG(PMSICR_EL1_REG, info->pmsicr);
        isb();

        base = buf_start_addr(info->buf_idx, info);
        limit = base + (info->buf_size/2);
        /* Enable the buffer */
        limit &= PMBLIMITR_LIMIT_MASK; /* Zero lower 12 bits */
        limit |= PMBLIMITR_E;
        /*
         * Set the base and limit. Restore the base pointer if sampling has
         * previously been enabled for this thread.
         */
        if (buf->pmbptr == 0)
                WRITE_SPECIALREG(PMBPTR_EL1_REG, base);
        else
                WRITE_SPECIALREG(PMBPTR_EL1_REG, buf->pmbptr);
        WRITE_SPECIALREG(PMBLIMITR_EL1_REG, limit);
        isb();

        /* Enable sampling */
        WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
        isb();

        info->enabled = true;

        if (ctx->mode == HWT_MODE_THREAD)
                CPU_SET(PCPU_GET(cpuid), &ctx->cpu_map);

        mtx_unlock_spin(&info->lock);
}

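/*
 * CPU mode enable: record each CPU's buffer details, set up PID tracking in
 * CONTEXTIDR_EL1 if requested, then start profiling on all CPUs in the
 * context via SMP rendezvous.
 */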
static int
spe_backend_enable_smp(struct hwt_context *ctx)
{
        struct arm_spe_info *info;
        struct hwt_vm *vm;
        int cpu_id;

        KASSERT(ctx->mode == HWT_MODE_CPU,
            ("%s: should only be called for CPU mode", __func__));

        HWT_CTX_LOCK(ctx);
        CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                vm = hwt_cpu_get(ctx, cpu_id)->vm;
                KASSERT(spe_info[cpu_id] == &spe_info_cpu[cpu_id],
                    ("%s: spe_info mismatch for cpu_id=%u", __func__, cpu_id));
                info = &spe_info_cpu[cpu_id];

                mtx_lock_spin(&info->lock);
                info->kvaddr = vm->kvaddr;
                info->buf_size = ctx->bufsize;
                mtx_unlock_spin(&info->lock);
        }
        HWT_CTX_UNLOCK(ctx);

        cpu_id = CPU_FFS(&ctx->cpu_map) - 1;
        KASSERT(spe_info[cpu_id] == &spe_info_cpu[cpu_id],
            ("%s: spe_info mismatch for cpu_id=%u", __func__, cpu_id));
        info = spe_info[cpu_id];
        arm64_pid_in_contextidr = (info->ctx_field == ARM_SPE_CTX_PID);

        smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
            arm_spe_enable, smp_no_rendezvous_barrier, NULL);

        return (0);
}

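/*
 * Disable profiling on the current CPU, draining any buffered trace data and
 * saving the buffer write pointer so sampling can resume where it stopped.
 * The caller must hold info->lock.
 */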
static void
arm_spe_disable_nolock(void)
{
        struct arm_spe_info *info = spe_info[PCPU_GET(cpuid)];
        struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];
        struct hwt_context *ctx = info->sc->ctx;

        if (!info->enabled)
                return;

        dprintf("%s on cpu:%d\n", __func__, PCPU_GET(cpuid));

        /* Disable profiling */
        WRITE_SPECIALREG(PMSCR_EL1_REG, 0x0);
        isb();

        /* Drain any remaining tracing data */
        psb_csync();
        dsb(nsh);

        /* Disable the profiling buffer */
        WRITE_SPECIALREG(PMBLIMITR_EL1_REG, 0);
        isb();

        /* Clear interrupt status reg */
        WRITE_SPECIALREG(PMBSR_EL1_REG, 0x0);

        /* Clear PID/CPU_ID from context ID reg */
        WRITE_SPECIALREG(CONTEXTIDR_EL1_REG, 0);

        buf->pmbptr = READ_SPECIALREG(PMBPTR_EL1_REG);
        info->enabled = false;

        if (ctx->mode == HWT_MODE_THREAD)
                CPU_CLR(PCPU_GET(cpuid), &ctx->cpu_map);
}

void
arm_spe_disable(void *arg __unused)
{
        struct arm_spe_info *info = spe_info[PCPU_GET(cpuid)];

        mtx_lock_spin(&info->lock);
        arm_spe_disable_nolock();
        mtx_unlock_spin(&info->lock);
}

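/*
 * Disable tracing on all CPUs in the context, queue any remaining buffer
 * data for copy-out and notify userspace with ARM_SPE_KQ_SHUTDOWN that it
 * can shut down.
 */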
int
spe_backend_disable_smp(struct hwt_context *ctx)
{
        struct kevent kev;
        struct arm_spe_info *info;
        struct arm_spe_buf_info *buf;
        int cpu_id;
        int ret;

        if (!CPU_EMPTY(&ctx->cpu_map)) {
                /* Disable and send out remaining data in bufs */
                smp_rendezvous_cpus(ctx->cpu_map, smp_no_rendezvous_barrier,
                    arm_spe_disable, smp_no_rendezvous_barrier, NULL);

                CPU_FOREACH_ISSET(cpu_id, &ctx->cpu_map) {
                        info = spe_info[cpu_id];
                        buf = &info->buf_info[info->buf_idx];
                        arm_spe_send_buffer(buf, 0);
                }
        }

        arm64_pid_in_contextidr = false;

        /*
         * Tracing on all CPUs has been disabled, and we've sent write ptr
         * offsets for all bufs - let userspace know it can shutdown
         */
        EV_SET(&kev, ARM_SPE_KQ_SHUTDOWN, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
        ret = kqfd_register(ctx->kqueue_fd, &kev, ctx->hwt_td, M_WAITOK);
        if (ret)
                dprintf("%s kqfd_register ret:%d\n", __func__, ret);

        return (0);
}

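/*
 * Thread mode enable entry point; must be called on the CPU being enabled
 * (CPU mode is handled via spe_backend_enable_smp() instead).
 */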
static void
spe_backend_enable(struct hwt_context *ctx, int cpu_id)
{
        struct arm_spe_info *info;

        if (ctx->mode == HWT_MODE_CPU)
                return;
        KASSERT(curcpu == cpu_id,
            ("%s: attempting to enable SPE on another cpu", __func__));

        info = spe_info[cpu_id];

        KASSERT(info != NULL, ("%s: info=NULL", __func__));

        arm64_pid_in_contextidr = (info->ctx_field == ARM_SPE_CTX_PID);

        arm_spe_enable(NULL);
}

static void
spe_backend_disable(struct hwt_context *ctx, int cpu_id)
{
        struct arm_spe_info *info = spe_info[PCPU_GET(cpuid)];

        if (ctx->mode == HWT_MODE_CPU)
                return;

        KASSERT(curcpu == cpu_id,
            ("%s: attempting to disable SPE on another cpu", __func__));

        mtx_lock_spin(&info->lock);

        if (!info->stopped)
                arm_spe_disable_nolock();

        mtx_unlock_spin(&info->lock);
}

static void
arm_spe_flush(void *arg, int pending __unused)
{
        struct arm_spe_info *info = arg;
        struct arm_spe_buf_info *buf = &info->buf_info[info->buf_idx];

        arm_spe_send_buffer(buf, 0);
}

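/*
 * Stop the tracing session. In thread mode, mark each thread as stopped and
 * schedule a flush of any buffer that is not actively tracing; buffers still
 * tracing are sent on their next disable.
 */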
static void
spe_backend_stop(struct hwt_context *ctx)
{
        struct arm_spe_info *info;
        struct hwt_thread *thr;

        HWT_CTX_LOCK(ctx);

        if (ctx->mode == HWT_MODE_THREAD) {
                ctx->state = CTX_STATE_STOPPED;

                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        info = (struct arm_spe_info *)thr->private;

                        mtx_lock_spin(&info->lock);

                        info->stopped = true;

                        if (!info->enabled) {
                                /* Not tracing; enqueue the buffer for sending */
                                TASK_INIT(&info->flush_task, 0, arm_spe_flush,
                                    info);
                                taskqueue_enqueue(taskqueue_arm_spe,
                                    &info->flush_task);
                        }
                        /*
                         * Otherwise tracing is currently active; as this
                         * thread has been marked as stopped, the buffer will
                         * be sent on the next disable.
                         */

                        mtx_unlock_spin(&info->lock);
                }

        }

        HWT_CTX_UNLOCK(ctx);

        taskqueue_drain_all(taskqueue_arm_spe);

        spe_backend_disable_smp(ctx);
}

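/*
 * SMP rendezvous callback: resume sampling with the previously programmed
 * PMSCR value once a full buffer half has been serviced.
 */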
static void
arm_spe_reenable(void *arg __unused)
{
        struct arm_spe_info *info = spe_info[PCPU_GET(cpuid)];

        WRITE_SPECIALREG(PMSCR_EL1_REG, info->pmscr);
        isb();
}

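/*
 * Handle the HWT_IOC_SVC_BUF ioctl: userspace has copied out a buffer half,
 * so clear its needs-servicing flag and, if profiling was paused waiting on
 * it, re-enable profiling on the owning CPU.
 */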
static int
spe_backend_svc_buf(struct hwt_context *ctx, void *data, size_t data_size,
    int data_version)
{
        struct arm_spe_info *info = NULL;
        struct arm_spe_buf_info *buf;
        struct arm_spe_svc_buf *s;
        struct hwt_thread *thr;
        int err = 0;
        cpuset_t cpu_set;

        if (data_size != sizeof(struct arm_spe_svc_buf))
                return (E2BIG);

        if (data_version != 1)
                return (EINVAL);

        s = (struct arm_spe_svc_buf *)data;
        if (s->buf_idx > 1)
                return (EINVAL);

        if (ctx->mode == HWT_MODE_CPU) {
                if (s->ident > mp_maxid)
                        return (EINVAL);

                info = spe_info[s->ident];
        } else {
                TAILQ_FOREACH(thr, &ctx->threads, next) {
                        if (thr->thread_id != s->ident)
                                continue;
                        info = (struct arm_spe_info *)thr->private;
                        break;
                }

                if (info == NULL)
                        return (ENOENT);
        }

        mtx_lock_spin(&info->lock);

        buf = &info->buf_info[s->buf_idx];

        if (!info->enabled && ctx->mode == HWT_MODE_CPU) {
                err = ENXIO;
                goto end;
        }

        /* Clear the flag that signals the buffer needs servicing */
        buf->buf_svc = false;

        /* Re-enable profiling if we've been waiting for this notification */
        if (buf->buf_wait && !info->stopped) {
                CPU_SETOF(s->ident, &cpu_set);

                mtx_unlock_spin(&info->lock);
                smp_rendezvous_cpus(cpu_set, smp_no_rendezvous_barrier,
                    arm_spe_reenable, smp_no_rendezvous_barrier, NULL);
                mtx_lock_spin(&info->lock);

                buf->buf_wait = false;
        }

end:
        mtx_unlock_spin(&info->lock);
        return (err);
}

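/*
 * Handle the HWT_IOC_BUFPTR_GET ioctl: dequeue the first pending buffer half
 * and return its identifier, offset and status flags.
 */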
static int
spe_backend_read(struct hwt_vm *vm, int *ident, vm_offset_t *offset,
    uint64_t *data)
{
        struct arm_spe_queue *q;
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        int error = 0;

        mtx_lock_spin(&sc->sc_lock);

        /* Return the first pending buffer that needs servicing */
        q = STAILQ_FIRST(&sc->pending);
        if (q == NULL) {
                error = ENOENT;
                goto out;
        }
        *ident = q->ident;
        *offset = q->offset;
        *data = (q->buf_idx << KQ_BUF_POS_SHIFT) |
            (q->partial_rec << KQ_PARTREC_SHIFT) |
            (q->final_buf << KQ_FINAL_BUF_SHIFT);

        STAILQ_REMOVE_HEAD(&sc->pending, next);
        sc->npending--;

out:
        mtx_unlock_spin(&sc->sc_lock);
        if (error)
                return (error);

        free(q, M_ARM_SPE);
        return (0);
}

static int
spe_backend_thread_alloc(struct hwt_thread *thr)
{
        struct arm_spe_softc *sc = device_get_softc(spe_dev);
        char lock_name[32];
        struct arm_spe_info *info;

        info = malloc(sizeof(*info), M_ARM_SPE, M_WAITOK | M_ZERO);

        info->sc = sc;
        info->buf_info[0].info = info;
        info->buf_info[0].buf_idx = 0;
        info->buf_info[1].info = info;
        info->buf_info[1].buf_idx = 1;
        snprintf(lock_name, sizeof(lock_name), "Arm SPE lock/thr/%d",
            thr->thread_id);
        mtx_init(&info->lock, lock_name, NULL, MTX_SPIN);

        thr->private = info;

        return (0);
}

static void
spe_backend_thread_free(struct hwt_thread *thr)
{
        struct arm_spe_info *info;

        info = (struct arm_spe_info *)thr->private;

        free(info, M_ARM_SPE);
}

static struct hwt_backend_ops spe_ops = {
        .hwt_backend_init = spe_backend_init,
        .hwt_backend_deinit = spe_backend_deinit,

        .hwt_backend_configure = spe_backend_configure,
        .hwt_backend_svc_buf = spe_backend_svc_buf,
        .hwt_backend_stop = spe_backend_stop,

        .hwt_backend_enable = spe_backend_enable,
        .hwt_backend_disable = spe_backend_disable,

        .hwt_backend_enable_smp = spe_backend_enable_smp,
        .hwt_backend_disable_smp = spe_backend_disable_smp,

        .hwt_backend_read = spe_backend_read,

        .hwt_backend_thread_alloc = spe_backend_thread_alloc,
        .hwt_backend_thread_free = spe_backend_thread_free,
};

int
spe_register(device_t dev)
{
        spe_dev = dev;

        return (hwt_backend_register(&backend));
}