tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "bperf_cgroup.h"
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

// NOTE: many of the maps and global data will be modified before loading
//       from userspace (the perf tool) using the skeleton helpers.
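//
// A minimal sketch of that userspace side, assuming the standard libbpf
// skeleton API (the actual logic lives in tools/perf/util/bpf_counter_cgroup.c;
// num_events, num_cpus and num_cgroups are placeholder variables here):
//
//     struct bperf_cgroup_bpf *skel = bperf_cgroup_bpf__open();
//
//     /* resize the maps to the real event/cpu/cgroup counts */
//     bpf_map__set_max_entries(skel->maps.events, num_cpus * num_events);
//     bpf_map__set_max_entries(skel->maps.cgrp_idx, num_cgroups);
//     bpf_map__set_max_entries(skel->maps.prev_readings, num_events);
//     bpf_map__set_max_entries(skel->maps.cgrp_readings,
//                              num_cgroups * num_events);
//
//     /* fix up the const volatile globals before load */
//     skel->rodata->num_events = num_events;
//     skel->rodata->num_cpus = num_cpus;
//
//     bperf_cgroup_bpf__load(skel);
//     /* then: perf event fds go into 'events', cgroup ids into 'cgrp_idx' */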

// single set of global perf events to measure
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(int));
        __uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u32));
        __uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
        __uint(max_entries, 1);
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct bpf_perf_event_value));
        __uint(max_entries, 1);
} cgrp_readings SEC(".maps");

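/*
 * The kernel's struct cgroup layout changed over time: newer kernels keep
 * an ancestors[] array of cgroup pointers, while older ones stored the ids
 * directly in ancestor_ids[].  Both shapes are declared below with
 * preserve_access_index so CO-RE relocations can match whichever field
 * exists on the running kernel (see get_cgroup_v1_ancestor_id()).
 */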
/* new kernel cgroup definition */
struct cgroup___new {
        int level;
        struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

/* old kernel cgroup definition */
struct cgroup___old {
        int level;
        u64 ancestor_ids[];
} __attribute__((preserve_access_index));

const volatile __u32 num_events = 1;    // number of events per cgroup
const volatile __u32 num_cpus = 1;      // number of online CPUs
const volatile int use_cgroup_v2 = 0;   // walk the v2 (unified) hierarchy?

int enabled = 0;          // set from userspace to start/stop counting
int perf_subsys_id = -1;  // perf_event cgroup subsys id, resolved on first use

static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
        /* recast pointer to capture new type for compiler */
        struct cgroup___new *cgrp_new = (void *)cgrp;

        if (bpf_core_field_exists(cgrp_new->ancestors)) {
                return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
        } else {
                /* recast pointer to capture old type for compiler */
                struct cgroup___old *cgrp_old = (void *)cgrp;

                return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
        }
}

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
        struct task_struct *p = (void *)bpf_get_current_task();
        struct cgroup *cgrp;
        register int i = 0;
        __u32 *elem;
        int level;
        int cnt;

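        /*
         * Resolve the perf_event cgroup subsystem id once.  With compilers
         * that support enum CO-RE relocation, the value is fixed up at load
         * time to match the running kernel; otherwise the build-time value
         * from vmlinux.h is used.
         */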
        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }
        cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
        level = BPF_CORE_READ(cgrp, level);

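        // walk the hierarchy from the root (level 0) down to the task's level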
        for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
                __u64 cgrp_id;

                if (i > level)
                        break;

                // convert cgroup-id to a map index
                cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)  // this ancestor is not among the measured cgroups
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
        register int i = 0;
        __u32 *elem;
        int cnt;

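        // walk the v2 hierarchy from the root; a zero id means no more ancestors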
        for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
                __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

                if (cgrp_id == 0)  // past the deepest level of this task's cgroup
                        break;

                // convert cgroup-id to a map index
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;

                cgrps[cnt++] = *elem;
                if (cnt == size)
                        break;
        }

        return cnt;
}

static int bperf_cgroup_count(void)
{
        register __u32 idx = 0;  // to have it in a register to pass BPF verifier
        register int c = 0;
        struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
        __u32 cpu = bpf_get_smp_processor_id();
        __u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
        int cgrp_cnt;
        __u32 key, cgrp;
        long err;

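        // collect the map indices of all measured ancestor cgroups of the
        // current task; the event deltas below are charged to each of them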
        if (use_cgroup_v2)
                cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
        else
                cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);

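        // for each event, compute the delta from the previous per-cpu snapshot
        // and add it to every collected cgroup's reading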
        for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
                if (idx == num_events)
                        break;

                // XXX: do not pass idx directly (for verifier)
                key = idx;
                // this is per-cpu array for diff
                prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                if (!prev_val) {
                        // first read of this event on this CPU: seed the
                        // snapshot with zeros so the first delta is the raw value
                        val.counter = val.enabled = val.running = 0;
                        bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

                        prev_val = bpf_map_lookup_elem(&prev_readings, &key);
                        if (!prev_val)
                                continue;
                }

                // read from the global perf_event array, laid out event-major:
                // entry (idx * num_cpus + cpu) is event 'idx' on this CPU
                key = idx * num_cpus + cpu;
                err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
                if (err)
                        continue;

                if (enabled) {
                        delta.counter = val.counter - prev_val->counter;
                        delta.enabled = val.enabled - prev_val->enabled;
                        delta.running = val.running - prev_val->running;

                        for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
                                if (c == cgrp_cnt)
                                        break;

                                cgrp = cgrp_idx[c];

                                // aggregate the result by cgroup; the map is
                                // laid out cgroup-major, so the slot for this
                                // (cgroup, event) pair is cgrp * num_events + idx
                                key = cgrp * num_events + idx;
                                cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
                                if (cgrp_val) {
                                        cgrp_val->counter += delta.counter;
                                        cgrp_val->enabled += delta.enabled;
                                        cgrp_val->running += delta.running;
                                } else {
                                        bpf_map_update_elem(&cgrp_readings, &key,
                                                            &delta, BPF_ANY);
                                }
                        }
                }

                // store the current reading as the baseline for the next delta
                *prev_val = val;
        }
        return 0;
}

// This will be attached to the cgroup-switches software event on each CPU
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
        return bperf_cgroup_count();
}

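// Run on demand from userspace (e.g. with BPF_PROG_TEST_RUN on each CPU)
// to flush the current counts into cgrp_readings before reading the map.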
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
        return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";