/* tools/perf/util/arm-spe.c */
// SPDX-License-Identifier: GPL-2.0
/*
 * Arm Statistical Profiling Extensions (SPE) support
 * Copyright (c) 2017-2018, Arm Ltd.
 */

#include <byteswap.h>
#include <endian.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/types.h>
#include <linux/zalloc.h>
#include <stdlib.h>
#include <unistd.h>

#include "auxtrace.h"
#include "color.h"
#include "debug.h"
#include "evlist.h"
#include "evsel.h"
#include "machine.h"
#include "session.h"
#include "symbol.h"
#include "thread.h"
#include "thread-stack.h"
#include "tsc.h"
#include "tool.h"
#include "util/synthetic-events.h"

#include "arm-spe.h"
#include "arm-spe-decoder/arm-spe-decoder.h"
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"

#include "../../arch/arm64/include/asm/cputype.h"
#define MAX_TIMESTAMP (~0ULL)

#define is_ldst_op(op)          (!!((op) & ARM_SPE_OP_LDST))

#define is_simd_op(op)          (!!((op) & (ARM_SPE_OP_SIMD_FP | ARM_SPE_OP_SVE | \
                                            ARM_SPE_OP_SME | ARM_SPE_OP_ASE)))

#define is_mem_op(op)           (is_ldst_op(op) || is_simd_op(op))

#define ARM_SPE_CACHE_EVENT(lvl) \
        (ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS)

#define arm_spe_is_cache_level(type, lvl) \
        ((type) & ARM_SPE_CACHE_EVENT(lvl))

#define arm_spe_is_cache_hit(type, lvl) \
        (((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS)

#define arm_spe_is_cache_miss(type, lvl) \
        ((type) & ARM_SPE_##lvl##_MISS)

/*
 * Per-session ARM SPE decoding state. The embedded "auxtrace" member is
 * how the perf session layer hands events back to this decoder.
 */
struct arm_spe {
        struct auxtrace                 auxtrace;
        struct auxtrace_queues          queues;
        struct auxtrace_heap            heap;
        struct itrace_synth_opts        synth_opts;
        u32                             auxtrace_type;
        struct perf_session             *session;
        struct machine                  *machine;
        u32                             pmu_type;

        /* Conversion parameters from raw SPE timestamps to perf time */
        struct perf_tsc_conversion      tc;

        /* Non-zero when trace has no usable timestamps */
        u8                              timeless_decoding;
        u8                              data_queued;

        /* perf_event sample_type used when re-injecting synthesized events */
        u64                             sample_type;
        /* Per-class enables for synthesized samples */
        u8                              sample_flc;
        u8                              sample_llc;
        u8                              sample_tlb;
        u8                              sample_branch;
        u8                              sample_remote_access;
        u8                              sample_memory;
        u8                              sample_instructions;

        /* Event IDs used to tag each class of synthesized sample */
        u64                             l1d_miss_id;
        u64                             l1d_access_id;
        u64                             llc_miss_id;
        u64                             llc_access_id;
        u64                             tlb_miss_id;
        u64                             tlb_access_id;
        u64                             branch_id;
        u64                             remote_access_id;
        u64                             memory_id;
        u64                             instructions_id;

        u64                             kernel_start;

        unsigned long                   num_events;
        /* Derive pid/tid from context packets rather than sideband */
        u8                              use_ctx_pkt_for_pid;

        /* Per-CPU metadata array recorded in the perf.data header */
        u64                             **metadata;
        u64                             metadata_ver;
        u64                             metadata_nr_cpu;
        /* All CPUs share the same SPE/MIDR properties */
        bool                            is_homogeneous;
};

/*
 * Per-queue (typically per-CPU or per-thread) decode state: the current and
 * previous auxtrace buffers being consumed, the decoder instance, and the
 * pid/tid/cpu attribution for synthesized samples.
 */
struct arm_spe_queue {
        struct arm_spe                  *spe;
        unsigned int                    queue_nr;
        /* Buffer currently being decoded */
        struct auxtrace_buffer          *buffer;
        /* Previous buffer, kept alive until the next one has data */
        struct auxtrace_buffer          *old_buffer;
        /* Scratch space for building synthesized events */
        union perf_event                *event_buf;
        bool                            on_heap;
        bool                            done;
        pid_t                           pid;
        pid_t                           tid;
        int                             cpu;
        struct arm_spe_decoder          *decoder;
        u64                             time;
        u64                             timestamp;
        struct thread                   *thread;
        u64                             sample_count;
        /* PERF_IP_FLAG_* for the current record */
        u32                             flags;
        /* Preallocated stack for --itrace last-branch synthesis */
        struct branch_stack             *last_branch;
};

/*
 * Pairs a set of CPU MIDR ranges with the callback that knows how to decode
 * those CPUs' SPE data source packets into perf_mem_data_src.
 */
struct data_source_handle {
        const struct midr_range *midr_ranges;
        void (*ds_synth)(const struct arm_spe_record *record,
                         union perf_mem_data_src *data_src);
};

/*
 * Build a data_source_handle entry: bind a MIDR range table to its
 * arm_spe__synth_<func> data source decode callback.
 */
#define DS(range, func)                                 \
        {                                               \
                .midr_ranges = range,                   \
                .ds_synth = arm_spe__synth_##func,      \
        }

/*
 * Hex-dump raw SPE trace data, one packet per line, with a decoded
 * description appended when the packet parses cleanly.
 */
static void arm_spe_dump(struct arm_spe *spe __maybe_unused,
                         unsigned char *buf, size_t len)
{
        const char *color = PERF_COLOR_BLUE;
        char desc[ARM_SPE_PKT_DESC_MAX];
        struct arm_spe_pkt packet;
        size_t pos = 0;
        int ret, pkt_len, i;

        color_fprintf(stdout, color,
                      ". ... ARM SPE data: size %#zx bytes\n",
                      len);

        while (len) {
                ret = arm_spe_get_packet(buf, len, &packet);
                /* On a parse failure, step over a single byte and resync */
                pkt_len = ret > 0 ? ret : 1;

                printf(".");
                color_fprintf(stdout, color, "  %08zx: ", pos);
                for (i = 0; i < pkt_len; i++)
                        color_fprintf(stdout, color, " %02x", buf[i]);
                /* Pad the hex column out to 16 bytes */
                for (; i < 16; i++)
                        color_fprintf(stdout, color, "   ");

                if (ret > 0) {
                        if (!arm_spe_pkt_desc(&packet, desc,
                                              ARM_SPE_PKT_DESC_MAX))
                                color_fprintf(stdout, color, " %s\n", desc);
                } else {
                        color_fprintf(stdout, color, " Bad packet!\n");
                }

                pos += pkt_len;
                buf += pkt_len;
                len -= pkt_len;
        }
}

/* Terminate the current output line, then dump the raw SPE buffer. */
static void arm_spe_dump_event(struct arm_spe *spe, unsigned char *buf,
                               size_t len)
{
        fputs(".\n", stdout);
        arm_spe_dump(spe, buf, len);
}

/*
 * Decoder callback: hand the next non-empty auxtrace buffer to the decoder.
 * Keeps the previous buffer's data alive until a new one is available so the
 * decoder may still reference it, then drops it.
 *
 * Returns 0 with b->len == 0 when the queue is exhausted, -ENOMEM if buffer
 * data cannot be loaded from the perf data file.
 */
static int arm_spe_get_trace(struct arm_spe_buffer *b, void *data)
{
        struct arm_spe_queue *speq = data;
        struct auxtrace_queue *queue =
                &speq->spe->queues.queue_array[speq->queue_nr];
        struct auxtrace_buffer *buffer;

        /* Iterative form of the "skip empty buffers" tail recursion */
        for (;;) {
                buffer = auxtrace_buffer__next(queue, speq->buffer);
                if (!buffer) {
                        /* No more data: release the previous buffer */
                        if (speq->old_buffer)
                                auxtrace_buffer__drop_data(speq->old_buffer);
                        b->len = 0;
                        return 0;
                }

                speq->buffer = buffer;

                /* Lazily map the buffer contents from the perf data file */
                if (!buffer->data) {
                        int fd = perf_data__fd(speq->spe->session->data);

                        buffer->data = auxtrace_buffer__get_data(buffer, fd);
                        if (!buffer->data)
                                return -ENOMEM;
                }

                b->len = buffer->size;
                b->buf = buffer->data;

                if (b->len) {
                        if (speq->old_buffer)
                                auxtrace_buffer__drop_data(speq->old_buffer);
                        speq->old_buffer = buffer;
                        return 0;
                }

                /* Empty buffer: drop it and try the next one */
                auxtrace_buffer__drop_data(buffer);
        }
}

/*
 * Allocate and initialize one decode queue: scratch event buffer, optional
 * last-branch stack, and a decoder instance wired to arm_spe_get_trace().
 * Returns NULL on any allocation failure.
 */
static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe,
                unsigned int queue_nr)
{
        struct arm_spe_params params = { .get_trace = 0, };
        struct arm_spe_queue *speq = zalloc(sizeof(*speq));

        if (!speq)
                return NULL;

        speq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
        if (!speq->event_buf)
                goto err;

        speq->spe = spe;
        speq->queue_nr = queue_nr;
        speq->pid = -1;
        speq->tid = -1;
        speq->cpu = -1;

        params.get_trace = arm_spe_get_trace;
        params.data = speq;

        if (spe->synth_opts.last_branch) {
                /* Room for at most two entries: PBT + TGT */
                size_t sz = sizeof(struct branch_stack) +
                            sizeof(struct branch_entry) *
                            min(spe->synth_opts.last_branch_sz, 2U);

                speq->last_branch = zalloc(sz);
                if (!speq->last_branch)
                        goto err;
        }

        speq->decoder = arm_spe_decoder_new(&params);
        if (!speq->decoder)
                goto err;

        return speq;

err:
        zfree(&speq->event_buf);
        zfree(&speq->last_branch);
        free(speq);

        return NULL;
}

/* Classify an instruction address as kernel or user space. */
static inline u8 arm_spe_cpumode(struct arm_spe *spe, u64 ip)
{
        if (ip >= spe->kernel_start)
                return PERF_RECORD_MISC_KERNEL;

        return PERF_RECORD_MISC_USER;
}

/*
 * Refresh the queue's pid/tid/cpu attribution, preferring the machine's
 * per-CPU current tid over the queue's static tid.
 */
static void arm_spe_set_pid_tid_cpu(struct arm_spe *spe,
                                    struct auxtrace_queue *queue)
{
        struct arm_spe_queue *speq = queue->priv;
        pid_t tid = machine__get_current_tid(spe->machine, speq->cpu);

        if (tid == -1) {
                speq->tid = queue->tid;
        } else {
                /* tid changed: drop the cached thread so it is re-resolved */
                speq->tid = tid;
                thread__zput(speq->thread);
        }

        if (!speq->thread && speq->tid != -1)
                speq->thread = machine__find_thread(spe->machine, -1,
                                                    speq->tid);

        if (speq->thread) {
                speq->pid = thread__pid(speq->thread);
                /* Per-thread mode: take the CPU from the thread itself */
                if (queue->cpu == -1)
                        speq->cpu = thread__cpu(speq->thread);
        }
}

/*
 * Record "tid" as current on this queue's CPU, then re-derive the queue's
 * pid/tid/cpu from it. Returns 0 on success or the machine layer's error.
 */
static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid)
{
        struct arm_spe *spe = speq->spe;
        int ret;

        ret = machine__set_current_tid(spe->machine, speq->cpu, -1, tid);
        if (ret)
                return ret;

        arm_spe_set_pid_tid_cpu(spe, &spe->queues.queue_array[speq->queue_nr]);
        return 0;
}

/*
 * Look up the per-CPU metadata record for "cpu". A negative CPU means
 * per-thread mode; in that case CPU0's metadata stands in for all CPUs,
 * but only when the system is homogeneous. Returns NULL when no metadata
 * is available or no entry matches.
 */
static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu)
{
        u64 idx;

        if (!spe->metadata)
                return NULL;

        if (cpu < 0) {
                /*
                 * On a heterogeneous system a CPU ID of -1 gives us no way
                 * to tell which core's encoding applies, so give up.
                 */
                if (!spe->is_homogeneous)
                        return NULL;

                /* Homogeneous: any CPU's metadata will do, use CPU0's */
                return spe->metadata[0];
        }

        for (idx = 0; idx < spe->metadata_nr_cpu; idx++) {
                u64 *md = spe->metadata[idx];

                if (md[ARM_SPE_CPU] == (u64)cpu)
                        return md;
        }

        return NULL;
}

static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *record)
{
        struct simd_flags simd_flags = {};

        if (record->op & ARM_SPE_OP_SVE)
                simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;

        if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
                simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;

        if (record->type & ARM_SPE_SVE_EMPTY_PRED)
                simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;

        return simd_flags;
}

/*
 * Fill in the fields common to every synthesized sample (time, ip, cpumode,
 * pid/tid, period, cpu, SIMD flags) and initialize the event header.
 * Callers add the per-class fields (addr, data_src, weight, ...) afterwards.
 */
static void arm_spe_prep_sample(struct arm_spe *spe,
                                struct arm_spe_queue *speq,
                                union perf_event *event,
                                struct perf_sample *sample)
{
        struct arm_spe_record *record = &speq->decoder->record;

        /* With timeless decoding there is no usable timestamp */
        if (!spe->timeless_decoding)
                sample->time = tsc_to_perf_time(record->timestamp, &spe->tc);

        sample->ip = record->from_ip;
        sample->cpumode = arm_spe_cpumode(spe, sample->ip);
        sample->pid = speq->pid;
        sample->tid = speq->tid;
        sample->period = spe->synth_opts.period;
        sample->cpu = speq->cpu;
        sample->simd_flags = arm_spe__synth_simd_flags(record);

        event->sample.header.type = PERF_RECORD_SAMPLE;
        event->sample.header.misc = sample->cpumode;
        event->sample.header.size = sizeof(struct perf_event_header);
}

/*
 * Build the queue's preallocated branch stack for the current record.
 * At most two entries are produced: the sampled branch itself (TGT,
 * from_ip -> to_ip, with fully decoded flags) and, space permitting, the
 * previous branch target (PBT, target-only, flags unknown).
 */
static void arm_spe__prep_branch_stack(struct arm_spe_queue *speq)
{
        struct arm_spe *spe = speq->spe;
        struct arm_spe_record *record = &speq->decoder->record;
        struct branch_stack *bstack = speq->last_branch;
        struct branch_flags *bs_flags;
        unsigned int last_branch_sz = spe->synth_opts.last_branch_sz;
        /* TGT entry exists when this record was classified as a branch */
        bool have_tgt = !!(speq->flags & PERF_IP_FLAG_BRANCH);
        /* PBT entry needs a free slot after TGT and a recorded target */
        bool have_pbt = last_branch_sz >= (have_tgt + 1U) && record->prev_br_tgt;
        size_t sz = sizeof(struct branch_stack) +
                    sizeof(struct branch_entry) * min(last_branch_sz, 2U) /* PBT + TGT */;
        int i = 0;

        /* Clean up branch stack */
        memset(bstack, 0x0, sz);

        if (!have_tgt && !have_pbt)
                return;

        if (have_tgt) {
                bstack->entries[i].from = record->from_ip;
                bstack->entries[i].to = record->to_ip;

                bs_flags = &bstack->entries[i].flags;
                bs_flags->value = 0;

                if (record->op & ARM_SPE_OP_BR_CR_BL) {
                        if (record->op & ARM_SPE_OP_BR_COND)
                                bs_flags->type |= PERF_BR_COND_CALL;
                        else
                                bs_flags->type |= PERF_BR_CALL;
                /*
                 * Indirect branch instruction without link (e.g. BR),
                 * take this case as function return.
                 */
                } else if (record->op & ARM_SPE_OP_BR_CR_RET ||
                           record->op & ARM_SPE_OP_BR_INDIRECT) {
                        if (record->op & ARM_SPE_OP_BR_COND)
                                bs_flags->type |= PERF_BR_COND_RET;
                        else
                                bs_flags->type |= PERF_BR_RET;
                } else if (record->op & ARM_SPE_OP_BR_CR_NON_BL_RET) {
                        if (record->op & ARM_SPE_OP_BR_COND)
                                bs_flags->type |= PERF_BR_COND;
                        else
                                bs_flags->type |= PERF_BR_UNCOND;
                } else {
                        if (record->op & ARM_SPE_OP_BR_COND)
                                bs_flags->type |= PERF_BR_COND;
                        else
                                bs_flags->type |= PERF_BR_UNKNOWN;
                }

                if (record->type & ARM_SPE_BRANCH_MISS) {
                        bs_flags->mispred = 1;
                        bs_flags->predicted = 0;
                } else {
                        bs_flags->mispred = 0;
                        bs_flags->predicted = 1;
                }

                if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
                        bs_flags->not_taken = 1;

                if (record->type & ARM_SPE_IN_TXN)
                        bs_flags->in_tx = 1;

                /* Branch latency, saturated to the 16-bit cycles field */
                bs_flags->cycles = min(record->latency, 0xFFFFU);
                i++;
        }

        if (have_pbt) {
                /* Only the target is known for the previous branch */
                bs_flags = &bstack->entries[i].flags;
                bs_flags->type |= PERF_BR_UNKNOWN;
                bstack->entries[i].to = record->prev_br_tgt;
                i++;
        }

        bstack->nr = i;
        bstack->hw_idx = -1ULL;
}

/*
 * Re-serialize "sample" into "event" so it can be injected into the output
 * stream with the given sample type.
 */
static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type)
{
        size_t sz = perf_event__sample_event_size(sample, type, 0);

        event->header.size = sz;
        return perf_event__synthesize_sample(event, type, 0, sample);
}

/*
 * Deliver a synthesized event to the session, optionally re-serializing it
 * first when --itrace inject mode is active.
 */
static inline int
arm_spe_deliver_synth_event(struct arm_spe *spe,
                            struct arm_spe_queue *speq __maybe_unused,
                            union perf_event *event,
                            struct perf_sample *sample)
{
        int err = 0;

        if (spe->synth_opts.inject)
                err = arm_spe__inject_event(event, sample, spe->sample_type);

        if (!err) {
                err = perf_session__deliver_synth_event(spe->session, event, sample);
                if (err)
                        pr_err("ARM SPE: failed to deliver event, error %d\n", err);
        }

        return err;
}

/* Synthesize one memory-access sample (addr, phys_addr, data_src, weight). */
static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq,
                                     u64 spe_events_id,
                                     union perf_mem_data_src data_src)
{
        struct arm_spe *spe = speq->spe;
        struct arm_spe_record *record = &speq->decoder->record;
        union perf_event *event = speq->event_buf;
        struct perf_sample smp;
        int err;

        perf_sample__init(&smp, /*all=*/true);
        arm_spe_prep_sample(spe, speq, event, &smp);

        smp.id = spe_events_id;
        smp.stream_id = spe_events_id;
        smp.addr = record->virt_addr;
        smp.phys_addr = record->phys_addr;
        smp.data_src = data_src.val;
        smp.weight = record->latency;

        err = arm_spe_deliver_synth_event(spe, speq, event, &smp);
        perf_sample__exit(&smp);
        return err;
}

/* Synthesize one branch sample (target addr, flags, branch stack, weight). */
static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq,
                                        u64 spe_events_id)
{
        struct arm_spe *spe = speq->spe;
        struct arm_spe_record *record = &speq->decoder->record;
        union perf_event *event = speq->event_buf;
        struct perf_sample smp;
        int err;

        perf_sample__init(&smp, /*all=*/true);
        arm_spe_prep_sample(spe, speq, event, &smp);

        smp.id = spe_events_id;
        smp.stream_id = spe_events_id;
        smp.addr = record->to_ip;
        smp.weight = record->latency;
        smp.flags = speq->flags;
        smp.branch_stack = speq->last_branch;

        err = arm_spe_deliver_synth_event(spe, speq, event, &smp);
        perf_sample__exit(&smp);
        return err;
}

/*
 * Synthesize one instruction sample; carries both memory attribution
 * (phys_addr, data_src) and branch attribution (flags, branch stack).
 */
static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
                                             u64 spe_events_id,
                                             union perf_mem_data_src data_src)
{
        struct arm_spe *spe = speq->spe;
        struct arm_spe_record *record = &speq->decoder->record;
        union perf_event *event = speq->event_buf;
        struct perf_sample smp;
        int err;

        perf_sample__init(&smp, /*all=*/true);
        arm_spe_prep_sample(spe, speq, event, &smp);

        smp.id = spe_events_id;
        smp.stream_id = spe_events_id;
        smp.addr = record->to_ip;
        smp.phys_addr = record->phys_addr;
        smp.data_src = data_src.val;
        smp.weight = record->latency;
        smp.flags = speq->flags;
        smp.branch_stack = speq->last_branch;

        err = arm_spe_deliver_synth_event(spe, speq, event, &smp);
        perf_sample__exit(&smp);
        return err;
}

/*
 * CPUs using the architecturally-common data source packet encoding
 * (Neoverse / Cortex cores).
 */
static const struct midr_range common_ds_encoding_cpus[] = {
        MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_A720AE),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_A725),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_X4),
        MIDR_ALL_VERSIONS(MIDR_CORTEX_X925),
        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
        MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
        {},
};

/* CPUs using AmpereOne's IMPDEF data source encoding */
static const struct midr_range ampereone_ds_encoding_cpus[] = {
        MIDR_ALL_VERSIONS(MIDR_AMPERE1A),
        {},
};

/* CPUs using HiSilicon HIP's IMPDEF data source encoding */
static const struct midr_range hisi_hip_ds_encoding_cpus[] = {
        MIDR_ALL_VERSIONS(MIDR_HISI_HIP12),
        {},
};

static void arm_spe__sample_flags(struct arm_spe_queue *speq)
{
        const struct arm_spe_record *record = &speq->decoder->record;

        speq->flags = 0;
        if (record->op & ARM_SPE_OP_BRANCH_ERET) {
                speq->flags = PERF_IP_FLAG_BRANCH;

                if (record->type & ARM_SPE_BRANCH_MISS)
                        speq->flags |= PERF_IP_FLAG_BRANCH_MISS;

                if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
                        speq->flags |= PERF_IP_FLAG_NOT_TAKEN;

                if (record->type & ARM_SPE_IN_TXN)
                        speq->flags |= PERF_IP_FLAG_IN_TX;

                if (record->op & ARM_SPE_OP_BR_COND)
                        speq->flags |= PERF_IP_FLAG_CONDITIONAL;

                if (record->op & ARM_SPE_OP_BR_CR_BL)
                        speq->flags |= PERF_IP_FLAG_CALL;
                else if (record->op & ARM_SPE_OP_BR_CR_RET)
                        speq->flags |= PERF_IP_FLAG_RETURN;
                /*
                 * Indirect branch instruction without link (e.g. BR),
                 * take it as a function return.
                 */
                else if (record->op & ARM_SPE_OP_BR_INDIRECT)
                        speq->flags |= PERF_IP_FLAG_RETURN;
        }
}

/*
 * Decode the common (Neoverse/Cortex) data source encoding into the generic
 * perf_mem_data_src fields: cache level, level number, and snoop info.
 * Fields not assigned here keep whatever value the caller initialized.
 */
static void arm_spe__synth_data_source_common(const struct arm_spe_record *record,
                                              union perf_mem_data_src *data_src)
{
        /*
         * Even though four levels of cache hierarchy are possible, no known
         * production Neoverse systems currently include more than three levels
         * so for the time being we assume three exist. If a production system
         * is built with four then this function would have to be changed to
         * detect the number of levels for reporting.
         */

        /*
         * We have no data on the hit level or data source for stores in the
         * Neoverse SPE records.
         */
        if (record->op & ARM_SPE_OP_ST) {
                data_src->mem_lvl = PERF_MEM_LVL_NA;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
                data_src->mem_snoop = PERF_MEM_SNOOP_NA;
                return;
        }

        switch (record->source) {
        case ARM_SPE_COMMON_DS_L1D:
                data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        case ARM_SPE_COMMON_DS_L2:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        /* Line came from another core's private cache */
        case ARM_SPE_COMMON_DS_PEER_CORE:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        /*
         * We don't know if this is L1, L2 but we do know it was a cache-2-cache
         * transfer, so set SNOOPX_PEER
         */
        case ARM_SPE_COMMON_DS_LOCAL_CLUSTER:
        case ARM_SPE_COMMON_DS_PEER_CLUSTER:
                data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        /*
         * System cache is assumed to be L3
         */
        case ARM_SPE_COMMON_DS_SYS_CACHE:
                data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
                break;
        /*
         * We don't know what level it hit in, except it came from the other
         * socket
         */
        case ARM_SPE_COMMON_DS_REMOTE:
                data_src->mem_lvl = PERF_MEM_LVL_NA;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
                data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        case ARM_SPE_COMMON_DS_DRAM:
                data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        default:
                break;
        }
}

/*
 * Source is IMPDEF. Here we convert the source code used on AmpereOne cores
 * to the common (Neoverse, Cortex) to avoid duplicating the decoding code.
 */
static void arm_spe__synth_data_source_ampereone(const struct arm_spe_record *record,
                                                 union perf_mem_data_src *data_src)
{
        struct arm_spe_record common_record;

        switch (record->source) {
        case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
                common_record.source = ARM_SPE_COMMON_DS_PEER_CORE;
                break;
        case ARM_SPE_AMPEREONE_SLC:
                common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE;
                break;
        case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
                common_record.source = ARM_SPE_COMMON_DS_REMOTE;
                break;
        case ARM_SPE_AMPEREONE_DDR:
                common_record.source = ARM_SPE_COMMON_DS_DRAM;
                break;
        case ARM_SPE_AMPEREONE_L1D:
                common_record.source = ARM_SPE_COMMON_DS_L1D;
                break;
        case ARM_SPE_AMPEREONE_L2D:
                common_record.source = ARM_SPE_COMMON_DS_L2;
                break;
        default:
                pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
                                record->source);
                return;
        }

        common_record.op = record->op;
        arm_spe__synth_data_source_common(&common_record, data_src);
}

/*
 * Decode HiSilicon HIP's IMPDEF data source encoding into perf_mem_data_src.
 * Loads are mapped directly; stores fall back to the common decoder (which
 * reports them as N/A).
 */
static void arm_spe__synth_data_source_hisi_hip(const struct arm_spe_record *record,
                                                union perf_mem_data_src *data_src)
{
        /* Use common synthesis method to handle store operations */
        if (record->op & ARM_SPE_OP_ST) {
                arm_spe__synth_data_source_common(record, data_src);
                return;
        }

        switch (record->source) {
        /* Hit in another core's cache, clean vs modified (HITM) line */
        case ARM_SPE_HISI_HIP_PEER_CPU:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        case ARM_SPE_HISI_HIP_PEER_CPU_HITM:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        case ARM_SPE_HISI_HIP_L3:
                data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
                break;
        case ARM_SPE_HISI_HIP_L3_HITM:
                data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                break;
        /* Hit in a cache of another cluster on the local socket */
        case ARM_SPE_HISI_HIP_PEER_CLUSTER:
                data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        case ARM_SPE_HISI_HIP_PEER_CLUSTER_HITM:
                data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        /* Hit in a cache on the remote socket, level unknown */
        case ARM_SPE_HISI_HIP_REMOTE_SOCKET:
                data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
                data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        case ARM_SPE_HISI_HIP_REMOTE_SOCKET_HITM:
                data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
                break;
        /* Served by local or remote DRAM */
        case ARM_SPE_HISI_HIP_LOCAL_MEM:
                data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        case ARM_SPE_HISI_HIP_REMOTE_MEM:
                data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
                data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
                break;
        /* Non-cacheable device access */
        case ARM_SPE_HISI_HIP_NC_DEV:
                data_src->mem_lvl = PERF_MEM_LVL_IO | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        case ARM_SPE_HISI_HIP_L2:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        case ARM_SPE_HISI_HIP_L2_HITM:
                data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
                data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
                break;
        case ARM_SPE_HISI_HIP_L1:
                data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
                data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
                break;
        default:
                break;
        }
}

/* Table pairing MIDR range sets with their data source decode callbacks */
static const struct data_source_handle data_source_handles[] = {
        DS(common_ds_encoding_cpus, data_source_common),
        DS(ampereone_ds_encoding_cpus, data_source_ampereone),
        DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip),
};

/*
 * Derive the perf memory level fields for a load from the SPE event
 * type bits, when no data source packet provided the level already.
 */
static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record,
					   union perf_mem_data_src *data_src)
{
	/*
	 * Resolve hits from the innermost cache outwards: the first level
	 * reporting a hit is the best (closest) place the load could have
	 * been satisfied.
	 */
	if (arm_spe_is_cache_hit(record->type, L1D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
		return;
	}

	if (record->type & ARM_SPE_RECENTLY_FETCHED) {
		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB;
		return;
	}

	if (arm_spe_is_cache_hit(record->type, L2D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		return;
	}

	if (arm_spe_is_cache_hit(record->type, LLC)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		return;
	}

	/*
	 * Resolve misses from the outermost cache inwards: missing the
	 * highest level is the worst-case (furthest) miss.
	 */
	if (arm_spe_is_cache_miss(record->type, LLC)) {
		data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
		return;
	}

	if (arm_spe_is_cache_miss(record->type, L2D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
		return;
	}

	if (arm_spe_is_cache_miss(record->type, L1D)) {
		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
	}
}

static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record,
                                           union perf_mem_data_src *data_src)
{
        /* Record the greatest level info for a store operation. */
        if (arm_spe_is_cache_level(record->type, LLC)) {
                data_src->mem_lvl = PERF_MEM_LVL_L3;
                data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ?
                                     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
        } else if (arm_spe_is_cache_level(record->type, L2D)) {
                data_src->mem_lvl = PERF_MEM_LVL_L2;
                data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ?
                                     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
        } else if (arm_spe_is_cache_level(record->type, L1D)) {
                data_src->mem_lvl = PERF_MEM_LVL_L1;
                data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ?
                                     PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT;
                data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
        }
}

/*
 * Fill in the memory level, snoop and remote fields of @data_src from
 * @record, but only where a preceding data source packet has not
 * already set (more precise) values.
 */
static void arm_spe__synth_memory_level(struct arm_spe_queue *speq,
					const struct arm_spe_record *record,
					union perf_mem_data_src *data_src)
{
	struct arm_spe *spe = speq->spe;

	/*
	 * The data source packet contains more info for cache levels for
	 * peer snooping. So respect the memory level if has been set by
	 * data source parsing.
	 */
	if (!data_src->mem_lvl) {
		if (data_src->mem_op == PERF_MEM_OP_LOAD)
			arm_spe__synth_ld_memory_level(record, data_src);
		if (data_src->mem_op == PERF_MEM_OP_STORE)
			arm_spe__synth_st_memory_level(record, data_src);
	}

	/* Level still unknown after both paths: report "not available". */
	if (!data_src->mem_lvl) {
		data_src->mem_lvl = PERF_MEM_LVL_NA;
		data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
	}

	/*
	 * If 'mem_snoop' has been set by data source packet, skip to set
	 * it at here.
	 */
	if (!data_src->mem_snoop) {
		if (record->type & ARM_SPE_DATA_SNOOPED) {
			if (record->type & ARM_SPE_HITM)
				data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
			else
				data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
		} else {
			u64 *metadata =
				arm_spe__get_metadata_by_cpu(spe, speq->cpu);

			/*
			 * Set NA ("Not available") mode if no meta data or the
			 * SNOOPED event is not supported.
			 */
			if (!metadata ||
			    !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED))
				data_src->mem_snoop = PERF_MEM_SNOOP_NA;
			else
				data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
		}
	}

	/* Remote-access flag, unless the data source packet set it already. */
	if (!data_src->mem_remote) {
		if (record->type & ARM_SPE_REMOTE_ACCESS)
			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
	}
}

/*
 * Run the implementation-specific data source decoder matching this
 * CPU's MIDR, if any. Leaves @data_src untouched when the MIDR cannot
 * be determined or no decoder matches.
 */
static void arm_spe__synth_ds(struct arm_spe_queue *speq,
			      const struct arm_spe_record *record,
			      union perf_mem_data_src *data_src)
{
	struct arm_spe *spe = speq->spe;
	u64 *metadata = NULL;
	u64 midr;
	unsigned int i;

	/* Metadata version 1 assumes all CPUs are the same (old behavior) */
	if (spe->metadata_ver == 1) {
		const char *cpuid;

		pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
		cpuid = perf_env__cpuid(perf_session__env(spe->session));
		/* perf_env__cpuid() can fail; never hand NULL to strtoul() */
		if (!cpuid)
			return;
		/*
		 * MIDR is an unsigned hex bit pattern; strtoul() avoids the
		 * LONG_MAX clamping strtol() would apply to large values.
		 */
		midr = strtoul(cpuid, NULL, 16);
	} else {
		metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
		if (!metadata)
			return;

		midr = metadata[ARM_SPE_CPU_MIDR];
	}

	/* First matching MIDR range wins. */
	for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) {
		if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges))
			return data_source_handles[i].ds_synth(record, data_src);
	}
}

static union perf_mem_data_src
arm_spe__synth_data_source(struct arm_spe_queue *speq,
                           const struct arm_spe_record *record)
{
        union perf_mem_data_src data_src = {};

        if (!is_mem_op(record->op))
                return data_src;

        if (record->op & ARM_SPE_OP_LD)
                data_src.mem_op = PERF_MEM_OP_LOAD;
        else if (record->op & ARM_SPE_OP_ST)
                data_src.mem_op = PERF_MEM_OP_STORE;
        else
                data_src.mem_op = PERF_MEM_OP_NA;

        arm_spe__synth_ds(speq, record, &data_src);
        arm_spe__synth_memory_level(speq, record, &data_src);

        if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
                data_src.mem_dtlb = PERF_MEM_TLB_WK;

                if (record->type & ARM_SPE_TLB_MISS)
                        data_src.mem_dtlb |= PERF_MEM_TLB_MISS;
                else
                        data_src.mem_dtlb |= PERF_MEM_TLB_HIT;
        }

        return data_src;
}

static int arm_spe_sample(struct arm_spe_queue *speq)
{
        const struct arm_spe_record *record = &speq->decoder->record;
        struct arm_spe *spe = speq->spe;
        union perf_mem_data_src data_src;
        int err;

        /*
         * Discard all samples until period is reached
         */
        speq->sample_count++;
        if (speq->sample_count < spe->synth_opts.period)
                return 0;
        speq->sample_count = 0;

        arm_spe__sample_flags(speq);
        data_src = arm_spe__synth_data_source(speq, record);

        if (spe->sample_flc) {
                if (record->type & ARM_SPE_L1D_MISS) {
                        err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
                                                        data_src);
                        if (err)
                                return err;
                }

                if (record->type & ARM_SPE_L1D_ACCESS) {
                        err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
                                                        data_src);
                        if (err)
                                return err;
                }
        }

        if (spe->sample_llc) {
                if (record->type & ARM_SPE_LLC_MISS) {
                        err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
                                                        data_src);
                        if (err)
                                return err;
                }

                if (record->type & ARM_SPE_LLC_ACCESS) {
                        err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
                                                        data_src);
                        if (err)
                                return err;
                }
        }

        if (spe->sample_tlb) {
                if (record->type & ARM_SPE_TLB_MISS) {
                        err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
                                                        data_src);
                        if (err)
                                return err;
                }

                if (record->type & ARM_SPE_TLB_ACCESS) {
                        err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
                                                        data_src);
                        if (err)
                                return err;
                }
        }

        if (spe->synth_opts.last_branch &&
            (spe->sample_branch || spe->sample_instructions))
                arm_spe__prep_branch_stack(speq);

        if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
                err = arm_spe__synth_branch_sample(speq, spe->branch_id);
                if (err)
                        return err;
        }

        if (spe->sample_remote_access &&
            (record->type & ARM_SPE_REMOTE_ACCESS)) {
                err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
                                                data_src);
                if (err)
                        return err;
        }

        if (spe->sample_memory && is_mem_op(record->op)) {
                err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src);
                if (err)
                        return err;
        }

        if (spe->sample_instructions) {
                err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Decode and synthesize samples from one queue until either the trace
 * data is exhausted (returns 1) or, in timed decoding, the queue's
 * timestamp passes *@timestamp (returns 0 with *@timestamp updated).
 * Negative return values are errors from sample synthesis or context
 * tracking.
 */
static int arm_spe_run_decoder(struct arm_spe_queue *speq, u64 *timestamp)
{
	struct arm_spe *spe = speq->spe;
	struct arm_spe_record *record;
	int ret;

	if (!spe->kernel_start)
		spe->kernel_start = machine__kernel_start(spe->machine);

	while (1) {
		/*
		 * The usual logic is firstly to decode the packets, and then
		 * based on the record to synthesize sample; but here the flow
		 * is reversed: it calls arm_spe_sample() for synthesizing
		 * samples prior to arm_spe_decode().
		 *
		 * Two reasons for this code logic:
		 * 1. Firstly, when setup queue in arm_spe__setup_queue(), it
		 * has decoded trace data and generated a record, but the record
		 * is left to generate sample until run to here, so it's correct
		 * to synthesize sample for the left record.
		 * 2. After decoding trace data, it needs to compare the record
		 * timestamp with the coming perf event, if the record timestamp
		 * is later than the perf event, it needs bail out and pushes
		 * the record into auxtrace heap, thus the record can be
		 * deferred to synthesize sample until run to here at the next
		 * time; so this can correlate samples between Arm SPE trace
		 * data and other perf events with correct time ordering.
		 */

		/*
		 * Update pid/tid info.
		 */
		record = &speq->decoder->record;
		if (!spe->timeless_decoding && record->context_id != (u64)-1) {
			ret = arm_spe_set_tid(speq, record->context_id);
			if (ret)
				return ret;

			/* CONTEXT packets seen: prefer them over switch events */
			spe->use_ctx_pkt_for_pid = true;
		}

		ret = arm_spe_sample(speq);
		if (ret)
			return ret;

		ret = arm_spe_decode(speq->decoder);
		if (!ret) {
			pr_debug("No data or all data has been processed.\n");
			return 1;
		}

		/*
		 * Error is detected when decode SPE trace data, continue to
		 * the next trace data and find out more records.
		 */
		if (ret < 0)
			continue;

		record = &speq->decoder->record;

		/* Update timestamp for the last record */
		if (record->timestamp > speq->timestamp)
			speq->timestamp = record->timestamp;

		/*
		 * If the timestamp of the queue is later than timestamp of the
		 * coming perf event, bail out so can allow the perf event to
		 * be processed ahead.
		 */
		if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
			*timestamp = speq->timestamp;
			return 0;
		}
	}

	return 0;
}

/*
 * Allocate the per-queue decoder state for @queue and, for timed
 * decoding, decode up to the first record and push the queue onto the
 * timestamp-ordered heap.
 */
static int arm_spe__setup_queue(struct arm_spe *spe,
			       struct auxtrace_queue *queue,
			       unsigned int queue_nr)
{
	struct arm_spe_queue *speq = queue->priv;
	struct arm_spe_record *record;
	int ret;

	/* Nothing to do for an empty queue or one that is already set up. */
	if (list_empty(&queue->head) || speq)
		return 0;

	speq = arm_spe__alloc_queue(spe, queue_nr);
	if (!speq)
		return -ENOMEM;

	queue->priv = speq;

	if (queue->cpu != -1)
		speq->cpu = queue->cpu;

	if (speq->on_heap || spe->timeless_decoding)
		return 0;

	/* Decode until the first valid record, retrying on decode errors. */
	do {
		ret = arm_spe_decode(speq->decoder);
		if (!ret)
			return 0;
	} while (ret < 0);

	record = &speq->decoder->record;

	speq->timestamp = record->timestamp;
	ret = auxtrace_heap__add(&spe->heap, queue_nr, speq->timestamp);
	if (ret)
		return ret;
	speq->on_heap = true;

	return 0;
}

static int arm_spe__setup_queues(struct arm_spe *spe)
{
        unsigned int i;
        int ret;

        for (i = 0; i < spe->queues.nr_queues; i++) {
                ret = arm_spe__setup_queue(spe, &spe->queues.queue_array[i], i);
                if (ret)
                        return ret;
        }

        return 0;
}

/* (Re)build the queues only when new AUX trace data has arrived. */
static int arm_spe__update_queues(struct arm_spe *spe)
{
	if (!spe->queues.new_data)
		return 0;

	spe->queues.new_data = false;
	return arm_spe__setup_queues(spe);
}

/*
 * Decoding is "timeless" only if no event in the session samples time;
 * a single event with PERF_SAMPLE_TIME forces timed decoding.
 */
static bool arm_spe__is_timeless_decoding(struct arm_spe *spe)
{
	struct evsel *evsel;

	evlist__for_each_entry(spe->session->evlist, evsel) {
		if (evsel->core.attr.sample_type & PERF_SAMPLE_TIME)
			return false;
	}

	return true;
}

/*
 * Process all queues in timestamp order up to (but not including)
 * @timestamp, repeatedly popping the earliest queue from the auxtrace
 * heap, running its decoder, and re-adding it with its new position.
 */
static int arm_spe_process_queues(struct arm_spe *spe, u64 timestamp)
{
	unsigned int queue_nr;
	u64 ts;
	int ret;

	while (1) {
		struct auxtrace_queue *queue;
		struct arm_spe_queue *speq;

		if (!spe->heap.heap_cnt)
			return 0;

		/* Earliest queued record is not yet due. */
		if (spe->heap.heap_array[0].ordinal >= timestamp)
			return 0;

		queue_nr = spe->heap.heap_array[0].queue_nr;
		queue = &spe->queues.queue_array[queue_nr];
		speq = queue->priv;

		auxtrace_heap__pop(&spe->heap);

		/*
		 * Decode only up to the next queue's ordinal (plus one so
		 * this queue can overtake it), capped at @timestamp.
		 */
		if (spe->heap.heap_cnt) {
			ts = spe->heap.heap_array[0].ordinal + 1;
			if (ts > timestamp)
				ts = timestamp;
		} else {
			ts = timestamp;
		}

		/*
		 * A previous context-switch event has set pid/tid in the machine's context, so
		 * here we need to update the pid/tid in the thread and SPE queue.
		 */
		if (!spe->use_ctx_pkt_for_pid)
			arm_spe_set_pid_tid_cpu(spe, queue);

		ret = arm_spe_run_decoder(speq, &ts);
		if (ret < 0) {
			/* Keep the queue on the heap despite the error. */
			auxtrace_heap__add(&spe->heap, queue_nr, ts);
			return ret;
		}

		/* ret == 0: more data pending; ret > 0: queue exhausted. */
		if (!ret) {
			ret = auxtrace_heap__add(&spe->heap, queue_nr, ts);
			if (ret < 0)
				return ret;
		} else {
			speq->on_heap = false;
		}
	}

	return 0;
}

/*
 * Drain every queue matching @tid (all queues when @tid == -1) without
 * timestamp ordering; used when no event samples time.
 */
static int arm_spe_process_timeless_queues(struct arm_spe *spe, pid_t tid,
					    u64 time_)
{
	unsigned int i;
	u64 ts = 0;

	for (i = 0; i < spe->queues.nr_queues; i++) {
		struct auxtrace_queue *queue = &spe->queues.queue_array[i];
		struct arm_spe_queue *speq = queue->priv;

		if (!speq)
			continue;
		if (tid != -1 && speq->tid != tid)
			continue;

		speq->time = time_;
		arm_spe_set_pid_tid_cpu(spe, queue);
		arm_spe_run_decoder(speq, &ts);
	}

	return 0;
}

/*
 * Record the incoming task's pid/tid for @sample's CPU from a context
 * switch event. Switch-in events are ignored: only the switch-out side
 * carries the next task's pid/tid.
 */
static int arm_spe_context_switch(struct arm_spe *spe, union perf_event *event,
				  struct perf_sample *sample)
{
	pid_t next_pid, next_tid;

	if (!(event->header.misc & PERF_RECORD_MISC_SWITCH_OUT))
		return 0;

	next_pid = event->context_switch.next_prev_pid;
	next_tid = event->context_switch.next_prev_tid;

	if (next_tid == -1)
		pr_warning("context_switch event has no tid\n");

	return machine__set_current_tid(spe->machine, sample->cpu,
					next_pid, next_tid);
}

/*
 * Auxtrace hook invoked for every perf event: advances SPE decoding up
 * to the event's timestamp so synthesized samples interleave correctly
 * with other events. Requires ordered event delivery.
 */
static int arm_spe_process_event(struct perf_session *session,
				 union perf_event *event,
				 struct perf_sample *sample,
				 const struct perf_tool *tool)
{
	int err = 0;
	u64 timestamp;
	struct arm_spe *spe = container_of(session->auxtrace,
			struct arm_spe, auxtrace);

	if (dump_trace)
		return 0;

	if (!tool->ordered_events) {
		pr_err("SPE trace requires ordered events\n");
		return -EINVAL;
	}

	/* Convert perf time to the TSC domain used by the SPE decoder. */
	if (sample->time && (sample->time != (u64) -1))
		timestamp = perf_time_to_tsc(sample->time, &spe->tc);
	else
		timestamp = 0;

	if (timestamp || spe->timeless_decoding) {
		err = arm_spe__update_queues(spe);
		if (err)
			return err;
	}

	if (spe->timeless_decoding) {
		/* Without timestamps, flush a task's queues when it exits. */
		if (event->header.type == PERF_RECORD_EXIT) {
			err = arm_spe_process_timeless_queues(spe,
					event->fork.tid,
					sample->time);
		}
	} else if (timestamp) {
		err = arm_spe_process_queues(spe, timestamp);
		if (err)
			return err;

		/* Fall back to switch events when no CONTEXT packets exist. */
		if (!spe->use_ctx_pkt_for_pid &&
		    (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE ||
		    event->header.type == PERF_RECORD_SWITCH))
			err = arm_spe_context_switch(spe, event, sample);
	}

	return err;
}

/*
 * Queue an AUXTRACE event's data for later decoding. For file input the
 * data is referenced by its offset in the perf.data file; for pipes it
 * has to be copied. No-op if all data was queued up front.
 */
static int arm_spe_process_auxtrace_event(struct perf_session *session,
					  union perf_event *event,
					  const struct perf_tool *tool __maybe_unused)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					     auxtrace);

	if (!spe->data_queued) {
		struct auxtrace_buffer *buffer;
		off_t data_offset;
		int fd = perf_data__fd(session->data);
		int err;

		if (perf_data__is_pipe(session->data)) {
			data_offset = 0;
		} else {
			/* Trace bytes follow the event header at this offset */
			data_offset = lseek(fd, 0, SEEK_CUR);
			if (data_offset == -1)
				return -errno;
		}

		err = auxtrace_queues__add_event(&spe->queues, session, event,
				data_offset, &buffer);
		if (err)
			return err;

		/* Dump here now we have copied a piped trace out of the pipe */
		if (dump_trace) {
			if (auxtrace_buffer__get_data(buffer, fd)) {
				arm_spe_dump_event(spe, buffer->data,
						buffer->size);
				auxtrace_buffer__put_data(buffer);
			}
		}
	}

	return 0;
}

static int arm_spe_flush(struct perf_session *session __maybe_unused,
                         const struct perf_tool *tool __maybe_unused)
{
        struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
                        auxtrace);
        int ret;

        if (dump_trace)
                return 0;

        if (!tool->ordered_events)
                return -EINVAL;

        ret = arm_spe__update_queues(spe);
        if (ret < 0)
                return ret;

        if (spe->timeless_decoding)
                return arm_spe_process_timeless_queues(spe, -1,
                                MAX_TIMESTAMP - 1);

        ret = arm_spe_process_queues(spe, MAX_TIMESTAMP);
        if (ret)
                return ret;

        if (!spe->use_ctx_pkt_for_pid)
                ui__warning("Arm SPE CONTEXT packets not found in the traces.\n"
                            "Matching of TIDs to SPE events could be inaccurate.\n");

        return 0;
}

/*
 * Duplicate one CPU's metadata record (@per_cpu_size bytes from @buf).
 * Returns a heap copy owned by the caller, or NULL on allocation
 * failure. Plain malloc() suffices: memcpy() overwrites every byte, so
 * the zero-fill previously done by zalloc() was redundant work.
 */
static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
{
	u64 *metadata;

	metadata = malloc(per_cpu_size);
	if (!metadata)
		return NULL;

	memcpy(metadata, buf, per_cpu_size);
	return metadata;
}

/*
 * Free a per-CPU metadata table and each of its entries. Tolerates a
 * NULL table (metadata version 1 has no per-CPU records, so
 * spe->metadata can legitimately be NULL at teardown).
 */
static void arm_spe__free_metadata(u64 **metadata, int nr_cpu)
{
	int i;

	if (!metadata)
		return;

	for (i = 0; i < nr_cpu; i++)
		zfree(&metadata[i]);
	free(metadata);
}

/*
 * Parse the auxtrace info blob into a per-CPU metadata table. Sets
 * @ver and @nr_cpu; returns NULL for version 1 (which has no per-CPU
 * records), on a malformed header, or on allocation failure.
 */
static u64 **arm_spe__alloc_metadata(struct perf_record_auxtrace_info *info,
				     u64 *ver, int *nr_cpu)
{
	u64 *ptr = (u64 *)info->priv;
	u64 metadata_size;
	u64 **metadata = NULL;
	int hdr_sz, per_cpu_sz, i;

	metadata_size = info->header.size -
		sizeof(struct perf_record_auxtrace_info);

	/* Metadata version 1 */
	if (metadata_size == ARM_SPE_AUXTRACE_V1_PRIV_SIZE) {
		*ver = 1;
		*nr_cpu = 0;
		/* No per CPU metadata */
		return NULL;
	}

	*ver = ptr[ARM_SPE_HEADER_VERSION];
	hdr_sz = ptr[ARM_SPE_HEADER_SIZE];
	*nr_cpu = ptr[ARM_SPE_CPUS_NUM];

	/* Malformed header: refuse a zero CPU count to avoid dividing by 0 */
	if (!*nr_cpu)
		return NULL;

	metadata = calloc(*nr_cpu, sizeof(*metadata));
	if (!metadata)
		return NULL;

	/* Locate the start address of per CPU metadata */
	ptr += hdr_sz;
	per_cpu_sz = (metadata_size - (hdr_sz * sizeof(u64))) / (*nr_cpu);

	for (i = 0; i < *nr_cpu; i++) {
		metadata[i] = arm_spe__alloc_per_cpu_metadata(ptr, per_cpu_sz);
		if (!metadata[i])
			goto err_per_cpu_metadata;

		ptr += per_cpu_sz / sizeof(u64);
	}

	return metadata;

err_per_cpu_metadata:
	arm_spe__free_metadata(metadata, *nr_cpu);
	return NULL;
}

/*
 * Release all state owned by one queue: thread reference, decoder, and
 * the per-queue buffers. Safe to call with NULL.
 */
static void arm_spe_free_queue(void *priv)
{
	struct arm_spe_queue *speq = priv;

	if (!speq)
		return;
	/* Drop the thread reference before freeing the queue itself. */
	thread__zput(speq->thread);
	arm_spe_decoder_free(speq->decoder);
	zfree(&speq->event_buf);
	zfree(&speq->last_branch);
	free(speq);
}

static void arm_spe_free_events(struct perf_session *session)
{
        struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
                                             auxtrace);
        struct auxtrace_queues *queues = &spe->queues;
        unsigned int i;

        for (i = 0; i < queues->nr_queues; i++) {
                arm_spe_free_queue(queues->queue_array[i].priv);
                queues->queue_array[i].priv = NULL;
        }
        auxtrace_queues__free(queues);
}

/*
 * Tear down the whole SPE auxtrace instance: heap, queues, metadata and
 * the arm_spe struct itself. Detaches from the session before freeing.
 */
static void arm_spe_free(struct perf_session *session)
{
	struct arm_spe *spe = container_of(session->auxtrace, struct arm_spe,
					     auxtrace);

	auxtrace_heap__free(&spe->heap);
	arm_spe_free_events(session);
	session->auxtrace = NULL;
	arm_spe__free_metadata(spe->metadata, spe->metadata_nr_cpu);
	free(spe);
}

/* An evsel belongs to SPE when its attr type matches the SPE PMU type. */
static bool arm_spe_evsel_is_auxtrace(struct perf_session *session,
				      struct evsel *evsel)
{
	struct arm_spe *spe;

	spe = container_of(session->auxtrace, struct arm_spe, auxtrace);
	return spe->pmu_type == evsel->core.attr.type;
}

/* printf formats for dumping a version-1 metadata header (dump mode). */
static const char * const metadata_hdr_v1_fmts[] = {
	[ARM_SPE_PMU_TYPE]		= "  PMU Type           :%"PRId64"\n",
	[ARM_SPE_PER_CPU_MMAPS]		= "  Per CPU mmaps      :%"PRId64"\n",
};

/* printf formats for dumping a version-2+ metadata header (dump mode). */
static const char * const metadata_hdr_fmts[] = {
	[ARM_SPE_HEADER_VERSION]	= "  Header version     :%"PRId64"\n",
	[ARM_SPE_HEADER_SIZE]		= "  Header size        :%"PRId64"\n",
	[ARM_SPE_PMU_TYPE_V2]		= "  PMU type v2        :%"PRId64"\n",
	[ARM_SPE_CPUS_NUM]		= "  CPU number         :%"PRId64"\n",
};

/* printf formats for dumping one CPU's metadata record (dump mode). */
static const char * const metadata_per_cpu_fmts[] = {
	[ARM_SPE_MAGIC]			= "    Magic            :0x%"PRIx64"\n",
	[ARM_SPE_CPU]			= "    CPU #            :%"PRId64"\n",
	[ARM_SPE_CPU_NR_PARAMS]		= "    Num of params    :%"PRId64"\n",
	[ARM_SPE_CPU_MIDR]		= "    MIDR             :0x%"PRIx64"\n",
	[ARM_SPE_CPU_PMU_TYPE]		= "    PMU Type         :%"PRId64"\n",
	[ARM_SPE_CAP_MIN_IVAL]		= "    Min Interval     :%"PRId64"\n",
	[ARM_SPE_CAP_EVENT_FILTER]	= "    Event Filter     :0x%"PRIx64"\n",
};

/*
 * Pretty-print the auxtrace metadata (header, then each CPU's record)
 * in dump mode. Does nothing unless dump_trace is set.
 *
 * NOTE(review): hdr_size and the per-CPU sizes are read from the file
 * itself and used to index the format tables without bounds checks —
 * presumably trusted because the same tool wrote them; verify against
 * the record path if hardening is needed.
 */
static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr)
{
	unsigned int i, cpu, hdr_size, cpu_num, cpu_size;
	const char * const *hdr_fmts;

	if (!dump_trace)
		return;

	if (spe->metadata_ver == 1) {
		/* Version 1 has a fixed-size header and no per-CPU records */
		cpu_num = 0;
		hdr_size = ARM_SPE_AUXTRACE_V1_PRIV_MAX;
		hdr_fmts = metadata_hdr_v1_fmts;
	} else {
		cpu_num = arr[ARM_SPE_CPUS_NUM];
		hdr_size = arr[ARM_SPE_HEADER_SIZE];
		hdr_fmts = metadata_hdr_fmts;
	}

	for (i = 0; i < hdr_size; i++)
		fprintf(stdout, hdr_fmts[i], arr[i]);

	arr += hdr_size;
	for (cpu = 0; cpu < cpu_num; cpu++) {
		/*
		 * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS
		 * are fixed. The sequential parameter size is decided by the
		 * field 'ARM_SPE_CPU_NR_PARAMS'.
		 */
		cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS];
		for (i = 0; i < cpu_size; i++)
			fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
		arr += cpu_size;
	}
}

/*
 * Give the evsel whose first sample id equals @id a human-readable
 * name; used to label synthesized SPE events.
 */
static void arm_spe_set_event_name(struct evlist *evlist, u64 id,
				    const char *name)
{
	struct evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->core.id && evsel->core.id[0] == id) {
			/* zfree() is NULL-safe: the old guard was redundant */
			zfree(&evsel->name);
			evsel->name = strdup(name);
			break;
		}
	}
}

/*
 * Deliver one synthesized event attribute carrying sample id @id, record
 * the id in @id_out, and rename the matching evsel to @name so the
 * synthesized samples are reported under a readable event name.
 *
 * Returns 0 on success or the error from
 * perf_session__deliver_synth_attr_event(); on error @id_out is untouched.
 */
static int arm_spe_synth_one_event(struct perf_session *session,
                                   struct evlist *evlist,
                                   struct perf_event_attr *attr,
                                   u64 id, const char *name, u64 *id_out)
{
        int err;

        err = perf_session__deliver_synth_attr_event(session, attr, id);
        if (err)
                return err;

        *id_out = id;
        arm_spe_set_event_name(evlist, id, name);
        return 0;
}

/*
 * Synthesize the perf events requested via the --itrace options (cache,
 * TLB, branch, remote access, memory and instruction samples).  A common
 * perf_event_attr is derived from the SPE evsel found in the session's
 * evlist; each enabled event gets its own sample id, allocated as a fixed
 * offset from the evsel's ids by auxtrace_synth_id_range_start().
 *
 * Returns 0 on success (including when no SPE evsel is present) or a
 * negative error code from event delivery.
 */
static int
arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session)
{
        struct evlist *evlist = session->evlist;
        struct evsel *evsel;
        struct perf_event_attr attr;
        bool found = false;
        u64 id;
        int err;

        evlist__for_each_entry(evlist, evsel) {
                if (evsel->core.attr.type == spe->pmu_type) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                pr_debug("No selected events with SPE trace data\n");
                return 0;
        }

        memset(&attr, 0, sizeof(struct perf_event_attr));
        attr.size = sizeof(struct perf_event_attr);
        attr.type = PERF_TYPE_HARDWARE;
        /* Inherit only the standard sample bits (plus PHYS_ADDR) from the evsel */
        attr.sample_type = evsel->core.attr.sample_type &
                                (PERF_SAMPLE_MASK | PERF_SAMPLE_PHYS_ADDR);
        attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                            PERF_SAMPLE_PERIOD | PERF_SAMPLE_DATA_SRC |
                            PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR;
        if (spe->timeless_decoding)
                attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
        else
                attr.sample_type |= PERF_SAMPLE_TIME;

        spe->sample_type = attr.sample_type;

        attr.exclude_user = evsel->core.attr.exclude_user;
        attr.exclude_kernel = evsel->core.attr.exclude_kernel;
        attr.exclude_hv = evsel->core.attr.exclude_hv;
        attr.exclude_host = evsel->core.attr.exclude_host;
        attr.exclude_guest = evsel->core.attr.exclude_guest;
        attr.sample_id_all = evsel->core.attr.sample_id_all;
        attr.read_format = evsel->core.attr.read_format;
        attr.sample_period = spe->synth_opts.period;

        /* create new id val to be a fixed offset from evsel id */
        id = auxtrace_synth_id_range_start(evsel);

        if (spe->synth_opts.flc) {
                spe->sample_flc = true;

                /* Level 1 data cache miss */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "l1d-miss", &spe->l1d_miss_id);
                if (err)
                        return err;
                id += 1;

                /* Level 1 data cache access */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "l1d-access", &spe->l1d_access_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.llc) {
                spe->sample_llc = true;

                /* Last level cache miss */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "llc-miss", &spe->llc_miss_id);
                if (err)
                        return err;
                id += 1;

                /* Last level cache access */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "llc-access", &spe->llc_access_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.tlb) {
                spe->sample_tlb = true;

                /* TLB miss */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "tlb-miss", &spe->tlb_miss_id);
                if (err)
                        return err;
                id += 1;

                /* TLB access */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "tlb-access", &spe->tlb_access_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.last_branch) {
                if (spe->synth_opts.last_branch_sz > 2)
                        pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");

                attr.sample_type |= PERF_SAMPLE_BRANCH_STACK;
                /*
                 * We don't use the hardware index, but the sample generation
                 * code uses the new format branch_stack with this field,
                 * so the event attributes must indicate that it's present.
                 */
                attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
        }

        if (spe->synth_opts.branches) {
                spe->sample_branch = true;

                /* Branch */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "branch", &spe->branch_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.remote_access) {
                spe->sample_remote_access = true;

                /* Remote access */
                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "remote-access",
                                              &spe->remote_access_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.mem) {
                spe->sample_memory = true;

                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "memory", &spe->memory_id);
                if (err)
                        return err;
                id += 1;
        }

        if (spe->synth_opts.instructions) {
                spe->sample_instructions = true;
                attr.config = PERF_COUNT_HW_INSTRUCTIONS;

                err = arm_spe_synth_one_event(session, evlist, &attr, id,
                                              "instructions",
                                              &spe->instructions_id);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Report whether all recorded CPUs share the same MIDR (i.e. the system is
 * homogeneous from SPE's point of view).  Returns false when there are no
 * CPUs, when any per-CPU metadata record is missing, or when any CPU's
 * MIDR differs from the first one seen.
 */
static bool arm_spe__is_homogeneous(u64 **metadata, int nr_cpu)
{
        u64 first_midr = 0;
        int idx;

        if (!nr_cpu)
                return false;

        for (idx = 0; idx < nr_cpu; idx++) {
                if (!metadata[idx])
                        return false;

                if (!idx) {
                        first_midr = metadata[idx][ARM_SPE_CPU_MIDR];
                        continue;
                }

                if (metadata[idx][ARM_SPE_CPU_MIDR] != first_midr)
                        return false;
        }

        return true;
}

/*
 * Entry point for the PERF_RECORD_AUXTRACE_INFO event: set up Arm SPE
 * decoding for this session.  Parses the metadata (v1 fixed layout or
 * v2+ per-CPU records), allocates the struct arm_spe, hooks it into
 * session->auxtrace, configures timestamp conversion, and synthesizes the
 * requested sample events.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated here is unwound via the goto chain at the bottom.
 */
int arm_spe_process_auxtrace_info(union perf_event *event,
                                  struct perf_session *session)
{
        struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
        size_t min_sz = ARM_SPE_AUXTRACE_V1_PRIV_SIZE;
        struct perf_record_time_conv *tc = &session->time_conv;
        struct arm_spe *spe;
        u64 **metadata = NULL;
        u64 metadata_ver;
        int nr_cpu, err;

        /* Reject events too small to hold even the v1 private header */
        if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) +
                                        min_sz)
                return -EINVAL;

        /* v1 metadata legitimately yields a NULL metadata array */
        metadata = arm_spe__alloc_metadata(auxtrace_info, &metadata_ver,
                                           &nr_cpu);
        if (!metadata && metadata_ver != 1) {
                pr_err("Failed to parse Arm SPE metadata.\n");
                return -EINVAL;
        }

        spe = zalloc(sizeof(struct arm_spe));
        if (!spe) {
                err = -ENOMEM;
                goto err_free_metadata;
        }

        err = auxtrace_queues__init(&spe->queues);
        if (err)
                goto err_free;

        spe->session = session;
        spe->machine = &session->machines.host; /* No kvm support */
        spe->auxtrace_type = auxtrace_info->type;
        /* The PMU type lives at a different priv index in v1 vs v2+ layouts */
        if (metadata_ver == 1)
                spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
        else
                spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE_V2];
        spe->metadata = metadata;
        spe->metadata_ver = metadata_ver;
        spe->metadata_nr_cpu = nr_cpu;
        spe->is_homogeneous = arm_spe__is_homogeneous(metadata, nr_cpu);

        spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);

        /*
         * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead
         * and the parameters for hardware clock are stored in the session
         * context.  Passes these parameters to the struct perf_tsc_conversion
         * in "spe->tc", which is used for later conversion between clock
         * counter and timestamp.
         *
         * For backward compatibility, copies the fields starting from
         * "time_cycles" only if they are contained in the event.
         */
        spe->tc.time_shift = tc->time_shift;
        spe->tc.time_mult = tc->time_mult;
        spe->tc.time_zero = tc->time_zero;

        if (event_contains(*tc, time_cycles)) {
                spe->tc.time_cycles = tc->time_cycles;
                spe->tc.time_mask = tc->time_mask;
                spe->tc.cap_user_time_zero = tc->cap_user_time_zero;
                spe->tc.cap_user_time_short = tc->cap_user_time_short;
        }

        spe->auxtrace.process_event = arm_spe_process_event;
        spe->auxtrace.process_auxtrace_event = arm_spe_process_auxtrace_event;
        spe->auxtrace.flush_events = arm_spe_flush;
        spe->auxtrace.free_events = arm_spe_free_events;
        spe->auxtrace.free = arm_spe_free;
        spe->auxtrace.evsel_is_auxtrace = arm_spe_evsel_is_auxtrace;
        session->auxtrace = &spe->auxtrace;

        arm_spe_print_info(spe, &auxtrace_info->priv[0]);

        /* Raw dump mode needs no synthesis or queue indexing */
        if (dump_trace)
                return 0;

        if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
                spe->synth_opts = *session->itrace_synth_opts;
        } else {
                itrace_synth_opts__set_default(&spe->synth_opts, false);
                /* Default nanoseconds period not supported */
                spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS;
                spe->synth_opts.period = 1;
        }

        /* SPE periods are sample-count based; only the 'i' period type is valid */
        if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
                ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n");
                err = -EINVAL;
                goto err_free_queues;
        }
        if (spe->synth_opts.period > 1)
                ui__warning("Arm SPE has a hardware-based sampling period.\n\n"
                            "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n");

        err = arm_spe_synth_events(spe, session);
        if (err)
                goto err_free_queues;

        err = auxtrace_queues__process_index(&spe->queues, session);
        if (err)
                goto err_free_queues;

        if (spe->queues.populated)
                spe->data_queued = true;

        return 0;

err_free_queues:
        auxtrace_queues__free(&spe->queues);
        /* Detach before freeing so the session never sees a dangling pointer */
        session->auxtrace = NULL;
err_free:
        free(spe);
err_free_metadata:
        arm_spe__free_metadata(metadata, nr_cpu);
        return err;
}