drivers/perf/riscv_pmu.c
// SPDX-License-Identifier: GPL-2.0
/*
 * RISC-V performance counter support.
 *
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 *
 * This implementation is based on old RISC-V perf and ARM perf event code
 * which are in turn based on sparc64 and x86 code.
 */

#include <linux/cpumask.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/perf/riscv_pmu.h>
#include <linux/printk.h>
#include <linux/smp.h>
#include <linux/sched_clock.h>

#include <asm/sbi.h>

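/*
 * Direct user-space reads of a counter are only advertised for hardware,
 * hw-cache and raw events that carry PERF_EVENT_FLAG_USER_READ_CNT and are
 * already bound to a counter (hw.idx != -1).
 */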
static bool riscv_perf_user_access(struct perf_event *event)
{
        return ((event->attr.type == PERF_TYPE_HARDWARE) ||
                (event->attr.type == PERF_TYPE_HW_CACHE) ||
                (event->attr.type == PERF_TYPE_RAW)) &&
                !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) &&
                (event->hw.idx != -1);
}

void arch_perf_update_userpage(struct perf_event *event,
                               struct perf_event_mmap_page *userpg, u64 now)
{
        struct clock_read_data *rd;
        unsigned int seq;
        u64 ns;

        userpg->cap_user_time = 0;
        userpg->cap_user_time_zero = 0;
        userpg->cap_user_time_short = 0;
        userpg->cap_user_rdpmc = riscv_perf_user_access(event);

        /*
         * The counters are 64-bit, but the priv spec doesn't mandate that all
         * the bits be implemented, so the counter width can vary across CPU
         * vendors.
         */
        if (userpg->cap_user_rdpmc)
                userpg->pmc_width = to_riscv_pmu(event->pmu)->ctr_get_width(event->hw.idx) + 1;

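        /*
         * Snapshot the sched_clock parameters under the seqcount and retry
         * if the clock was updated concurrently, so user space sees a
         * consistent set of time_* fields.
         */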
        do {
                rd = sched_clock_read_begin(&seq);

                userpg->time_mult = rd->mult;
                userpg->time_shift = rd->shift;
                userpg->time_zero = rd->epoch_ns;
                userpg->time_cycles = rd->epoch_cyc;
                userpg->time_mask = rd->sched_clock_mask;

                /*
                 * Subtract the cycle base, such that software that
                 * doesn't know about cap_user_time_short still 'works'
                 * assuming no wraps.
                 */
                ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
                userpg->time_zero -= ns;

        } while (sched_clock_read_retry(seq));

        userpg->time_offset = userpg->time_zero - now;

        /*
         * time_shift is not expected to be greater than 31 because the
         * originally published conversion algorithm shifted a 32-bit value
         * (it now specifies a 64-bit value) - see the perf_event_mmap_page
         * documentation in perf_event.h.
         */
        if (userpg->time_shift == 32) {
                userpg->time_shift = 31;
                userpg->time_mult >>= 1;
        }

        /*
         * Internal timekeeping for enabled/running/stopped times
         * is always computed with the sched_clock.
         */
        userpg->cap_user_time = 1;
        userpg->cap_user_time_zero = 1;
        userpg->cap_user_time_short = 1;
}

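/*
 * csr_read() needs the CSR number as a compile-time constant (it ends up as
 * an immediate in the csrr instruction), so csr_read_num() expands one switch
 * case per counter CSR through the switchcase_csr_read_*() macro ladder, e.g.
 * switchcase_csr_read_32(CSR_CYCLE, ret) generates
 * "case CSR_CYCLE + 0: ret = csr_read(CSR_CYCLE + 0); break;" through
 * "case CSR_CYCLE + 31: ...".
 */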
static unsigned long csr_read_num(int csr_num)
{
#define switchcase_csr_read(__csr_num, __val)           {\
        case __csr_num:                                 \
                __val = csr_read(__csr_num);            \
                break; }
#define switchcase_csr_read_2(__csr_num, __val)         {\
        switchcase_csr_read(__csr_num + 0, __val)        \
        switchcase_csr_read(__csr_num + 1, __val)}
#define switchcase_csr_read_4(__csr_num, __val)         {\
        switchcase_csr_read_2(__csr_num + 0, __val)      \
        switchcase_csr_read_2(__csr_num + 2, __val)}
#define switchcase_csr_read_8(__csr_num, __val)         {\
        switchcase_csr_read_4(__csr_num + 0, __val)      \
        switchcase_csr_read_4(__csr_num + 4, __val)}
#define switchcase_csr_read_16(__csr_num, __val)        {\
        switchcase_csr_read_8(__csr_num + 0, __val)      \
        switchcase_csr_read_8(__csr_num + 8, __val)}
#define switchcase_csr_read_32(__csr_num, __val)        {\
        switchcase_csr_read_16(__csr_num + 0, __val)     \
        switchcase_csr_read_16(__csr_num + 16, __val)}

        unsigned long ret = 0;

        switch (csr_num) {
        switchcase_csr_read_32(CSR_CYCLE, ret)
        switchcase_csr_read_32(CSR_CYCLEH, ret)
        default:
                break;
        }

        return ret;
#undef switchcase_csr_read_32
#undef switchcase_csr_read_16
#undef switchcase_csr_read_8
#undef switchcase_csr_read_4
#undef switchcase_csr_read_2
#undef switchcase_csr_read
}

/*
 * Read the CSR of a corresponding counter.
 */
unsigned long riscv_pmu_ctr_read_csr(unsigned long csr)
{
        if (csr < CSR_CYCLE || csr > CSR_HPMCOUNTER31H ||
           (csr > CSR_HPMCOUNTER31 && csr < CSR_CYCLEH)) {
                pr_err("Invalid performance counter csr %lx\n", csr);
                return -EINVAL;
        }

        return csr_read_num(csr);
}

u64 riscv_pmu_ctr_get_width_mask(struct perf_event *event)
{
        int cwidth;
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;

        if (hwc->idx == -1)
                /* Handle init case where idx is not initialized yet */
                cwidth = rvpmu->ctr_get_width(0);
        else
                cwidth = rvpmu->ctr_get_width(hwc->idx);

        return GENMASK_ULL(cwidth, 0);
}

u64 riscv_pmu_event_update(struct perf_event *event)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;
        u64 prev_raw_count, new_raw_count;
        unsigned long cmask;
        u64 oldval, delta;

        if (!rvpmu->ctr_read || (hwc->state & PERF_HES_UPTODATE))
                return 0;

        cmask = riscv_pmu_ctr_get_width_mask(event);

        do {
                prev_raw_count = local64_read(&hwc->prev_count);
                new_raw_count = rvpmu->ctr_read(event);
                oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                                         new_raw_count);
        } while (oldval != prev_raw_count);

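        /*
         * Mask the delta with the counter width so that a counter narrower
         * than 64 bits that wrapped between the two reads still yields the
         * correct delta.
         */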
        delta = (new_raw_count - prev_raw_count) & cmask;
        local64_add(delta, &event->count);
        local64_sub(delta, &hwc->period_left);

        return delta;
}

void riscv_pmu_stop(struct perf_event *event, int flags)
{
        struct hw_perf_event *hwc = &event->hw;
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);

        if (!(hwc->state & PERF_HES_STOPPED)) {
                if (rvpmu->ctr_stop) {
                        rvpmu->ctr_stop(event, 0);
                        hwc->state |= PERF_HES_STOPPED;
                }
                riscv_pmu_event_update(event);
                hwc->state |= PERF_HES_UPTODATE;
        }
}

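/*
 * Compute the start value so the counter overflows after at most
 * sample_period events. Returns 1 if at least one full period has elapsed
 * since the counter was last updated.
 */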
int riscv_pmu_event_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 left = local64_read(&hwc->period_left);
        s64 period = hwc->sample_period;
        int overflow = 0;
        uint64_t max_period = riscv_pmu_ctr_get_width_mask(event);

        if (unlikely(left <= -period)) {
                left = period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                overflow = 1;
        }

        if (unlikely(left <= 0)) {
                left += period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                overflow = 1;
        }

        /*
         * Limit the maximum period to prevent the counter value
         * from overtaking the one we are about to program. In
         * effect we are reducing max_period to account for
         * interrupt latency (and we are being very conservative).
         */
        if (left > (max_period >> 1))
                left = (max_period >> 1);

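        /*
         * Record -left as the previous count; riscv_pmu_start() will program
         * the counter with this value (masked to the counter width) so that
         * it overflows after "left" more events.
         */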
        local64_set(&hwc->prev_count, (u64)-left);

        perf_event_update_userpage(event);

        return overflow;
}

void riscv_pmu_start(struct perf_event *event, int flags)
{
        struct hw_perf_event *hwc = &event->hw;
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        uint64_t max_period = riscv_pmu_ctr_get_width_mask(event);
        u64 init_val;

        if (flags & PERF_EF_RELOAD)
                WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

        hwc->state = 0;
        riscv_pmu_event_set_period(event);
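        /* Start from the value computed by set_period(), masked to the counter width. */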
        init_val = local64_read(&hwc->prev_count) & max_period;
        rvpmu->ctr_start(event, init_val);
        perf_event_update_userpage(event);
}

static int riscv_pmu_add(struct perf_event *event, int flags)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
        struct hw_perf_event *hwc = &event->hw;
        int idx;

        idx = rvpmu->ctr_get_idx(event);
        if (idx < 0)
                return idx;

        hwc->idx = idx;
        cpuc->events[idx] = event;
        cpuc->n_events++;
        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
        if (flags & PERF_EF_START)
                riscv_pmu_start(event, PERF_EF_RELOAD);

        /* Propagate our changes to the userspace mapping. */
        perf_event_update_userpage(event);

        return 0;
}

static void riscv_pmu_del(struct perf_event *event, int flags)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
        struct hw_perf_event *hwc = &event->hw;

        riscv_pmu_stop(event, PERF_EF_UPDATE);
        cpuc->events[hwc->idx] = NULL;
        /* The firmware needs to reset the counter mapping */
        if (rvpmu->ctr_stop)
                rvpmu->ctr_stop(event, RISCV_PMU_STOP_FLAG_RESET);
        cpuc->n_events--;
        if (rvpmu->ctr_clear_idx)
                rvpmu->ctr_clear_idx(event);
        perf_event_update_userpage(event);
        hwc->idx = -1;
}

static void riscv_pmu_read(struct perf_event *event)
{
        riscv_pmu_event_update(event);
}

static int riscv_pmu_event_init(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
        int mapped_event;
        u64 event_config = 0;
        uint64_t cmask;

        /* driver does not support branch stack sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        hwc->flags = 0;
        mapped_event = rvpmu->event_map(event, &event_config);
        if (mapped_event < 0) {
                pr_debug("event %x:%llx not supported\n", event->attr.type,
                         event->attr.config);
                return mapped_event;
        }

        /*
         * idx is set to -1 because the index of a general event should not be
         * decided until it is bound to a counter in pmu->add().
         * config will contain the information used to program the counter,
         * and idx will contain the counter index once it has been assigned.
         */
        hwc->config = event_config;
        hwc->idx = -1;
        hwc->event_base = mapped_event;

        if (rvpmu->event_init)
                rvpmu->event_init(event);

        if (!is_sampling_event(event)) {
                /*
                 * For non-sampling runs, limit the sample_period to half
                 * of the counter width. That way, the new counter value
                 * is far less likely to overtake the previous one unless
                 * you have some serious IRQ latency issues.
                 */
                cmask = riscv_pmu_ctr_get_width_mask(event);
                hwc->sample_period  =  cmask >> 1;
                hwc->last_period    = hwc->sample_period;
                local64_set(&hwc->period_left, hwc->sample_period);
        }

        return 0;
}

static int riscv_pmu_event_idx(struct perf_event *event)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);

        if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
                return 0;

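        /*
         * perf exposes a 1-based index through the mmap page: returning 0
         * tells user space that direct counter reads are not possible for
         * this event, and user space subtracts 1 to get the counter to read
         * (see the perf_event_mmap_page documentation in perf_event.h).
         */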
        if (rvpmu->csr_index)
                return rvpmu->csr_index(event) + 1;

        return 0;
}

static void riscv_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);

        if (rvpmu->event_mapped) {
                rvpmu->event_mapped(event, mm);
                perf_event_update_userpage(event);
        }
}

static void riscv_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
        struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);

        if (rvpmu->event_unmapped) {
                rvpmu->event_unmapped(event, mm);
                perf_event_update_userpage(event);
        }
}

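/*
 * Allocate and initialise the parts common to all RISC-V PMU drivers: the
 * riscv_pmu itself, the per-cpu hw_events state and the generic struct pmu
 * callbacks. The platform driver (e.g. the SBI PMU driver) is expected to
 * fill in the ctr_*()/event_*() backend callbacks before registration.
 */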
struct riscv_pmu *riscv_pmu_alloc(void)
{
        struct riscv_pmu *pmu;
        int cpuid, i;
        struct cpu_hw_events *cpuc;

        pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
        if (!pmu)
                goto out;

        pmu->hw_events = alloc_percpu_gfp(struct cpu_hw_events, GFP_KERNEL);
        if (!pmu->hw_events) {
                pr_info("failed to allocate per-cpu PMU data.\n");
                goto out_free_pmu;
        }

        for_each_possible_cpu(cpuid) {
                cpuc = per_cpu_ptr(pmu->hw_events, cpuid);
                cpuc->n_events = 0;
                for (i = 0; i < RISCV_MAX_COUNTERS; i++)
                        cpuc->events[i] = NULL;
                cpuc->snapshot_addr = NULL;
        }
        pmu->pmu = (struct pmu) {
                .event_init     = riscv_pmu_event_init,
                .event_mapped   = riscv_pmu_event_mapped,
                .event_unmapped = riscv_pmu_event_unmapped,
                .event_idx      = riscv_pmu_event_idx,
                .add            = riscv_pmu_add,
                .del            = riscv_pmu_del,
                .start          = riscv_pmu_start,
                .stop           = riscv_pmu_stop,
                .read           = riscv_pmu_read,
        };

        return pmu;

out_free_pmu:
        kfree(pmu);
out:
        return NULL;
}