root/tools/perf/util/bpf-trace-summary.c
/* SPDX-License-Identifier: GPL-2.0 */
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "dwarf-regs.h" /* for EM_HOST */
#include "syscalltbl.h"
#include "util/cgroup.h"
#include "util/hashmap.h"
#include "util/trace.h"
#include "util/util.h"
#include <bpf/bpf.h>
#include <linux/rbtree.h>
#include <linux/time64.h>
#include <tools/libc_compat.h> /* reallocarray */

#include "bpf_skel/syscall_summary.h"
#include "bpf_skel/syscall_summary.skel.h"


static struct syscall_summary_bpf *skel;
static struct rb_root cgroups = RB_ROOT;

int trace_prepare_bpf_summary(enum trace_summary_mode mode)
{
        skel = syscall_summary_bpf__open();
        if (skel == NULL) {
                fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
                return -1;
        }

        if (mode == SUMMARY__BY_THREAD)
                skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
        else if (mode == SUMMARY__BY_CGROUP)
                skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
        else
                skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;

        if (cgroup_is_v2("perf_event") > 0)
                skel->rodata->use_cgroup_v2 = 1;

        if (syscall_summary_bpf__load(skel) < 0) {
                fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
                return -1;
        }

        if (syscall_summary_bpf__attach(skel) < 0) {
                fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
                return -1;
        }

        if (mode == SUMMARY__BY_CGROUP)
                read_all_cgroups(&cgroups);

        return 0;
}

void trace_start_bpf_summary(void)
{
        skel->bss->enabled = 1;
}

void trace_end_bpf_summary(void)
{
        skel->bss->enabled = 0;
}

struct syscall_node {
        int syscall_nr;
        struct syscall_stats stats;
};

static double rel_stddev(struct syscall_stats *stat)
{
        double variance, average;

        if (stat->count < 2)
                return 0;

        average = (double)stat->total_time / stat->count;

        variance = stat->squared_sum;
        variance -= (stat->total_time * stat->total_time) / stat->count;
        variance /= stat->count - 1;

        return 100 * sqrt(variance / stat->count) / average;
}

/*
 * The syscall_data is to maintain syscall stats ordered by total time.
 * It supports different summary modes like per-thread or global.
 *
 * For per-thread stats, it uses two-level data strurcture -
 * syscall_data is keyed by TID and has an array of nodes which
 * represents each syscall for the thread.
 *
 * For global stats, it's still two-level technically but we don't need
 * per-cpu analysis so it's keyed by the syscall number to combine stats
 * from different CPUs.  And syscall_data always has a syscall_node so
 * it can effectively work as flat hierarchy.
 *
 * For per-cgroup stats, it uses two-level data structure like thread
 * syscall_data is keyed by CGROUP and has an array of node which
 * represents each syscall for the cgroup.
 */
struct syscall_data {
        u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
        int nr_events;
        int nr_nodes;
        u64 total_time;
        struct syscall_node *nodes;
};

static int datacmp(const void *a, const void *b)
{
        const struct syscall_data * const *sa = a;
        const struct syscall_data * const *sb = b;

        return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
}

static int nodecmp(const void *a, const void *b)
{
        const struct syscall_node *na = a;
        const struct syscall_node *nb = b;

        return na->stats.total_time > nb->stats.total_time ? -1 : 1;
}

static size_t sc_node_hash(long key, void *ctx __maybe_unused)
{
        return key;
}

static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
{
        return key1 == key2;
}

static int print_common_stats(struct syscall_data *data, int max_summary, FILE *fp)
{
        int printed = 0;

        if (max_summary == 0 || max_summary > data->nr_nodes)
                max_summary = data->nr_nodes;

        for (int i = 0; i < max_summary; i++) {
                struct syscall_node *node = &data->nodes[i];
                struct syscall_stats *stat = &node->stats;
                double total = (double)(stat->total_time) / NSEC_PER_MSEC;
                double min = (double)(stat->min_time) / NSEC_PER_MSEC;
                double max = (double)(stat->max_time) / NSEC_PER_MSEC;
                double avg = total / stat->count;
                const char *name;

                /* TODO: support other ABIs */
                name = syscalltbl__name(EM_HOST, node->syscall_nr);
                if (name)
                        printed += fprintf(fp, "   %-15s", name);
                else
                        printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);

                printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
                                   stat->count, stat->error, total, min, avg, max,
                                   rel_stddev(stat));
        }
        return printed;
}

static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
                               struct syscall_stats *map_data)
{
        struct syscall_data *data;
        struct syscall_node *nodes;

        if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
                data = zalloc(sizeof(*data));
                if (data == NULL)
                        return -ENOMEM;

                data->key = map_key->cpu_or_tid;
                if (hashmap__add(hash, data->key, data) < 0) {
                        free(data);
                        return -ENOMEM;
                }
        }

        /* update thread total stats */
        data->nr_events += map_data->count;
        data->total_time += map_data->total_time;

        nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
        if (nodes == NULL)
                return -ENOMEM;

        data->nodes = nodes;
        nodes = &data->nodes[data->nr_nodes++];
        nodes->syscall_nr = map_key->nr;

        /* each thread has an entry for each syscall, just use the stat */
        memcpy(&nodes->stats, map_data, sizeof(*map_data));
        return 0;
}

static int print_thread_stat(struct syscall_data *data, int max_summary, FILE *fp)
{
        int printed = 0;

        qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

        printed += fprintf(fp, " thread (%d), ", (int)data->key);
        printed += fprintf(fp, "%d events\n\n", data->nr_events);

        printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
        printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
        printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

        printed += print_common_stats(data, max_summary, fp);
        printed += fprintf(fp, "\n\n");

        return printed;
}

static int print_thread_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp)
{
        int printed = 0;

        for (int i = 0; i < nr_data; i++)
                printed += print_thread_stat(data[i], max_summary, fp);

        return printed;
}

static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
                              struct syscall_stats *map_data)
{
        struct syscall_data *data;
        struct syscall_stats *stat;

        if (!hashmap__find(hash, map_key->nr, &data)) {
                data = zalloc(sizeof(*data));
                if (data == NULL)
                        return -ENOMEM;

                data->nodes = zalloc(sizeof(*data->nodes));
                if (data->nodes == NULL) {
                        free(data);
                        return -ENOMEM;
                }

                data->nr_nodes = 1;
                data->key = map_key->nr;
                data->nodes->syscall_nr = data->key;

                if (hashmap__add(hash, data->key, data) < 0) {
                        free(data->nodes);
                        free(data);
                        return -ENOMEM;
                }
        }

        /* update total stats for this syscall */
        data->nr_events += map_data->count;
        data->total_time += map_data->total_time;

        /* This is sum of the same syscall from different CPUs */
        stat = &data->nodes->stats;

        stat->total_time += map_data->total_time;
        stat->squared_sum += map_data->squared_sum;
        stat->count += map_data->count;
        stat->error += map_data->error;

        if (stat->max_time < map_data->max_time)
                stat->max_time = map_data->max_time;
        if (stat->min_time > map_data->min_time || stat->min_time == 0)
                stat->min_time = map_data->min_time;

        return 0;
}

static int print_total_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp)
{
        int printed = 0;
        int nr_events = 0;

        for (int i = 0; i < nr_data; i++)
                nr_events += data[i]->nr_events;

        printed += fprintf(fp, " total, %d events\n\n", nr_events);

        printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
        printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
        printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

        if (max_summary == 0 || max_summary > nr_data)
                max_summary = nr_data;

        for (int i = 0; i < max_summary; i++)
                printed += print_common_stats(data[i], max_summary, fp);

        printed += fprintf(fp, "\n\n");
        return printed;
}

static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
                               struct syscall_stats *map_data)
{
        struct syscall_data *data;
        struct syscall_node *nodes;

        if (!hashmap__find(hash, map_key->cgroup, &data)) {
                data = zalloc(sizeof(*data));
                if (data == NULL)
                        return -ENOMEM;

                data->key = map_key->cgroup;
                if (hashmap__add(hash, data->key, data) < 0) {
                        free(data);
                        return -ENOMEM;
                }
        }

        /* update thread total stats */
        data->nr_events += map_data->count;
        data->total_time += map_data->total_time;

        nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
        if (nodes == NULL)
                return -ENOMEM;

        data->nodes = nodes;
        nodes = &data->nodes[data->nr_nodes++];
        nodes->syscall_nr = map_key->nr;

        /* each thread has an entry for each syscall, just use the stat */
        memcpy(&nodes->stats, map_data, sizeof(*map_data));
        return 0;
}

static int print_cgroup_stat(struct syscall_data *data, int max_summary, FILE *fp)
{
        int printed = 0;
        struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);

        qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);

        if (cgrp)
                printed += fprintf(fp, " cgroup %s,", cgrp->name);
        else
                printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);

        printed += fprintf(fp, " %d events\n\n", data->nr_events);

        printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
        printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
        printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

        printed += print_common_stats(data, max_summary, fp);
        printed += fprintf(fp, "\n\n");

        return printed;
}

static int print_cgroup_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp)
{
        int printed = 0;

        for (int i = 0; i < nr_data; i++)
                printed += print_cgroup_stat(data[i], max_summary, fp);

        return printed;
}

int trace_print_bpf_summary(FILE *fp, int max_summary)
{
        struct bpf_map *map = skel->maps.syscall_stats_map;
        struct syscall_key *prev_key, key;
        struct syscall_data **data = NULL;
        struct hashmap schash;
        struct hashmap_entry *entry;
        int nr_data = 0;
        int printed = 0;
        int i;
        size_t bkt;

        hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);

        printed = fprintf(fp, "\n Summary of events:\n\n");

        /* get stats from the bpf map */
        prev_key = NULL;
        while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
                struct syscall_stats stat;

                if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
                        switch (skel->rodata->aggr_mode) {
                        case SYSCALL_AGGR_THREAD:
                                update_thread_stats(&schash, &key, &stat);
                                break;
                        case SYSCALL_AGGR_CPU:
                                update_total_stats(&schash, &key, &stat);
                                break;
                        case SYSCALL_AGGR_CGROUP:
                                update_cgroup_stats(&schash, &key, &stat);
                                break;
                        default:
                                break;
                        }
                }

                prev_key = &key;
        }

        nr_data = hashmap__size(&schash);
        data = calloc(nr_data, sizeof(*data));
        if (data == NULL)
                goto out;

        i = 0;
        hashmap__for_each_entry(&schash, entry, bkt)
                data[i++] = entry->pvalue;

        qsort(data, nr_data, sizeof(*data), datacmp);

        switch (skel->rodata->aggr_mode) {
        case SYSCALL_AGGR_THREAD:
                printed += print_thread_stats(data, nr_data, max_summary, fp);
                break;
        case SYSCALL_AGGR_CPU:
                printed += print_total_stats(data, nr_data, max_summary, fp);
                break;
        case SYSCALL_AGGR_CGROUP:
                printed += print_cgroup_stats(data, nr_data, max_summary, fp);
                break;
        default:
                break;
        }

        for (i = 0; i < nr_data && data; i++) {
                free(data[i]->nodes);
                free(data[i]);
        }
        free(data);

out:
        hashmap__clear(&schash);
        return printed;
}

void trace_cleanup_bpf_summary(void)
{
        if (!RB_EMPTY_ROOT(&cgroups)) {
                struct cgroup *cgrp, *tmp;

                rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
                        cgroup__put(cgrp);

                cgroups = RB_ROOT;
        }

        syscall_summary_bpf__destroy(skel);
}