tools/perf/util/bpf_skel/off_cpu.bpf.c
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_UNINTERRUPTIBLE    0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

#define MAX_CPUS  4096
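/*
 * Longest raw sample that off_cpu_dump() below can emit, in u64 words:
 * 1 (tid/pid) + 1 (period) + 1 (callchain nr) + 1 (PERF_CONTEXT_USER) +
 * MAX_STACKS (32) addresses + 1 (cgroup id) = 37.
 */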
#define MAX_OFFCPU_LEN 37

// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1, hence the '__stack' name
struct __stack {
        u64 array[MAX_STACKS];
};

struct tstamp_data {
        __u32 stack_id;
        __u32 state;
        __u64 timestamp;
        struct __stack stack;
};

struct offcpu_key {
        __u32 pid;
        __u32 tgid;
        __u32 stack_id;
        __u32 state;
        __u64 cgroup_id;
};

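/*
 * Stack traces collected with bpf_get_stackid(); the returned id is kept in
 * the off_cpu hash key so that user space can resolve the callchain later.
 */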
struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, MAX_STACKS * sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct offcpu_data {
        u64 array[MAX_OFFCPU_LEN];
};

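/*
 * Perf event array used with bpf_perf_event_output() to stream raw samples
 * for long off-cpu events directly to user space, one slot per CPU.
 */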
struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
        __uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");

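/*
 * Per-CPU scratch buffer for assembling a raw sample before it is output;
 * presumably a map is used because struct offcpu_data is too big to keep on
 * the limited BPF stack.
 */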
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(struct offcpu_data));
        __uint(max_entries, 1);
} offcpu_payload SEC(".maps");

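/*
 * Per-task storage recording when a task went off-cpu, in which state, and
 * with which stack; consumed when the task is scheduled back in.
 */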
struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct tstamp_data);
} tstamp SEC(".maps");

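/*
 * Accumulated off-cpu time in ns for events below the dump threshold, keyed
 * by task, stack, state and cgroup; read back by user space afterwards.
 */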
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(struct offcpu_key));
        __uint(value_size, sizeof(__u64));
        __uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

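/*
 * Allow-lists for the optional CPU, task and cgroup filters below; the
 * loader presumably resizes max_entries to the actual number of filter
 * targets before the programs are loaded.
 */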
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u32));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(key_size, sizeof(__u64));
        __uint(value_size, sizeof(__u8));
        __uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
        long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
        long state;
} __attribute__((preserve_access_index));

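/*
 * Knobs written by user space: 'enabled' is flipped at runtime, while the
 * 'const volatile' values are set before the skeleton is loaded so that the
 * verifier can treat them as constants.
 */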
int enabled = 0;

const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

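/* minimum off-cpu time (in ns) before a sample is dumped directly */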
__u64 offcpu_thresh_ns;

/*
 * Old kernels called it task_struct->state; newer kernels renamed it to
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to handle both, as
 * described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
        /* recast pointer to capture new type for compiler */
        struct task_struct___new *t_new = (void *)t;

        if (bpf_core_field_exists(t_new->__state)) {
                return BPF_CORE_READ(t_new, __state);
        } else {
                /* recast pointer to capture old type for compiler */
                struct task_struct___old *t_old = (void *)t;

                return BPF_CORE_READ(t_old, state);
        }
}

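/*
 * Return the id of the cgroup the task belongs to: the kernfs node id of the
 * default hierarchy on cgroup v2, or of the perf_event subsystem's cgroup on
 * cgroup v1.
 */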
static inline __u64 get_cgroup_id(struct task_struct *t)
{
        struct cgroup *cgrp;

        if (!uses_cgroup_v1)
                return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

        if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
                perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
                                                     perf_event_cgrp_id);
#else
                perf_subsys_id = perf_event_cgrp_id;
#endif
        }

        cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
        return BPF_CORE_READ(cgrp, kn, id);
}

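/*
 * A context switch is recorded only for user tasks that went to sleep
 * (TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE) and that pass whichever of
 * the CPU, task and cgroup filters are enabled.
 */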
static inline int can_record(struct task_struct *t, int state)
{
        /* kernel threads don't have a user stack */
        if (t->flags & PF_KTHREAD)
                return 0;

        if (state != TASK_INTERRUPTIBLE &&
            state != TASK_UNINTERRUPTIBLE)
                return 0;

        if (has_cpu) {
                __u32 cpu = bpf_get_smp_processor_id();
                __u8 *ok;

                ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
                if (!ok)
                        return 0;
        }

        if (has_task) {
                __u8 *ok;
                __u32 pid;

                if (uses_tgid)
                        pid = t->tgid;
                else
                        pid = t->pid;

                ok = bpf_map_lookup_elem(&task_filter, &pid);
                if (!ok)
                        return 0;
        }

        if (has_cgroup) {
                __u8 *ok;
                __u64 cgrp_id = get_cgroup_id(t);

                ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
                if (!ok)
                        return 0;
        }

        return 1;
}

static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
{
        int len = 0;

        /* 'n + 2' skips over the callchain->nr and PERF_CONTEXT_USER slots */
        for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
                to->array[n + 2 + i] = from->array[i];

        return len;
}

/**
 * off_cpu_dump - dump off-cpu samples to the perf ring buffer
 * @data: payload for dumping off-cpu samples
 * @key: off-cpu key identifying the blocked task
 * @stack: stack trace of the task before being scheduled out
 * @delta: off-cpu time of the task, in ns
 *
 * When the off-cpu time exceeds the threshold, gather the tid, period,
 * callchain, and cgroup id of the task, and dump them as one raw sample
 * to the perf ring buffer, laid out in u64 words as:
 *   [0] tid/pid, [1] period,
 *   [2] callchain->nr (PERF_CONTEXT_USER plus 'len' addresses),
 *   [3] PERF_CONTEXT_USER, [4..3+len] user stack addresses,
 *   [4+len] cgroup id.
 */
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
                        struct __stack *stack, __u64 delta)
{
        int n = 0, len = 0;

        data->array[n++] = (u64)key->tgid << 32 | key->pid;
        data->array[n++] = delta;

        /* data->array[n] is callchain->nr (updated later) */
        data->array[n + 1] = PERF_CONTEXT_USER;
        data->array[n + 2] = 0;
        len = copy_stack(stack, data, n);

        /* update length of callchain: PERF_CONTEXT_USER plus 'len' addresses */
        data->array[n] = len + 1;
        n += len + 2;

        data->array[n++] = key->cgroup_id;

        return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
}

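/*
 * Called on every sched_switch: stamp 'prev' as it leaves the CPU, then
 * check whether 'next' has a pending stamp and either dump a sample directly
 * (delta >= offcpu_thresh_ns) or fold the delta into the off_cpu map.
 */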
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
                        struct task_struct *next, int state)
{
        __u64 ts;
        __u32 stack_id;
        struct tstamp_data *pelem;

        ts = bpf_ktime_get_ns();

        if (!can_record(prev, state))
                goto next;

        stack_id = bpf_get_stackid(ctx, &stacks,
                                   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

        pelem = bpf_task_storage_get(&tstamp, prev, NULL,
                                     BPF_LOCAL_STORAGE_GET_F_CREATE);
        if (!pelem)
                goto next;

        pelem->timestamp = ts;
        pelem->state = state;
        pelem->stack_id = stack_id;

        /*
         * If a stack was successfully collected by bpf_get_stackid(), collect
         * it once more into the task storage, as raw addresses this time, for
         * direct off-cpu sample dumping.
         */
        if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
                /*
                 * This empty if block avoids an 'unused result' warning for
                 * bpf_get_stack().  If the collection fails, just continue
                 * with the logic for the next task.
                 */
        }
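        /*
         * Whether or not 'prev' was recorded above, 'next' is going back
         * on-cpu: account any off-cpu time stamped for it earlier.
         */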
next:
        pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

        if (pelem && pelem->timestamp) {
                struct offcpu_key key = {
                        .pid = next->pid,
                        .tgid = next->tgid,
                        .stack_id = pelem->stack_id,
                        .state = pelem->state,
                        .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
                };
                __u64 delta = ts - pelem->timestamp;
                __u64 *total;

                if (delta >= offcpu_thresh_ns) {
                        int zero = 0;
                        struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);

                        if (data)
                                off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
                } else {
                        total = bpf_map_lookup_elem(&off_cpu, &key);
                        if (total)
                                *total += delta;
                        else
                                bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
                }

                /* prevent the timestamp from being reused later */
                pelem->timestamp = 0;
        }

        return 0;
}

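/*
 * Keep the task filter up to date across fork(): when a task that is already
 * in task_filter creates a new process, add the child's tgid as well.  New
 * threads share the parent's tgid (CLONE_THREAD) and need no entry.
 */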
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
        struct task_struct *task;
        u64 clone_flags;
        u32 pid;
        u8 val = 1;

        if (!uses_tgid)
                return 0;

        task = (struct task_struct *)bpf_get_current_task();

        pid = BPF_CORE_READ(task, tgid);
        if (!bpf_map_lookup_elem(&task_filter, &pid))
                return 0;

        task = (struct task_struct *)ctx[0];
        clone_flags = ctx[1];

        pid = task->tgid;
        if (!(clone_flags & CLONE_THREAD))
                bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

        return 0;
}

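/*
 * tp_btf/sched_switch arguments: ctx[0] is 'preempt', ctx[1] and ctx[2] are
 * the previous and next task, and ctx[3] carries 'prev_state' on kernels
 * recent enough to pass it (see has_prev_state).
 */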
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
        struct task_struct *prev, *next;
        int prev_state;

        if (!enabled)
                return 0;

        prev = (struct task_struct *)ctx[1];
        next = (struct task_struct *)ctx[2];

        if (has_prev_state)
                prev_state = (int)ctx[3];
        else
                prev_state = get_task_state(prev);

        return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";