tools/testing/selftests/bpf/map_tests/map_percpu_stats.c
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2023 Isovalent */
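
/*
 * Check that the number of elements reported for a map by the
 * map_percpu_stats bpf_map iterator program matches the "real" number of
 * elements obtained by walking all keys with bpf_map_get_next_key(), for a
 * variety of hash map types, both after concurrent updates and after
 * (batched and non-batched) deletions.
 */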

#include <errno.h>
#include <unistd.h>
#include <pthread.h>

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include <bpf_util.h>
#include <test_maps.h>

#include "map_percpu_stats.skel.h"

#define MAX_ENTRIES                     16384
#define MAX_ENTRIES_HASH_OF_MAPS        64
#define N_THREADS                       8
#define MAX_MAP_KEY_SIZE                4
#define PCPU_MIN_UNIT_SIZE              32768

static void map_info(int map_fd, struct bpf_map_info *info)
{
        __u32 len = sizeof(*info);
        int ret;

        memset(info, 0, sizeof(*info));

        ret = bpf_obj_get_info_by_fd(map_fd, info, &len);
        CHECK(ret < 0, "bpf_obj_get_info_by_fd", "error: %s\n", strerror(errno));
}

static const char *map_type_to_s(__u32 type)
{
        switch (type) {
        case BPF_MAP_TYPE_HASH:
                return "HASH";
        case BPF_MAP_TYPE_PERCPU_HASH:
                return "PERCPU_HASH";
        case BPF_MAP_TYPE_LRU_HASH:
                return "LRU_HASH";
        case BPF_MAP_TYPE_LRU_PERCPU_HASH:
                return "LRU_PERCPU_HASH";
        case BPF_MAP_TYPE_HASH_OF_MAPS:
                return "HASH_OF_MAPS";
        default:
                return "<define-me>";
        }
}

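/* Count elements the slow but reliable way: walk all keys with
 * bpf_map_get_next_key(). The type argument is currently unused. */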
static __u32 map_count_elements(__u32 type, int map_fd)
{
        __u32 key = -1;
        int n = 0;

        while (!bpf_map_get_next_key(map_fd, &key, &key))
                n++;
        return n;
}

#define BATCH   true

static void delete_and_lookup_batch(int map_fd, void *keys, __u32 count)
{
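        /* Reserve 8KB per entry, enough for per-cpu values on up to 1024
         * CPUs (see the similarly-sized buffers in delete_all_elements()
         * and patch_map_thread()) */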
        static __u8 values[(8 << 10) * MAX_ENTRIES];
        void *in_batch = NULL, *out_batch;
        __u32 save_count = count;
        int ret;

        ret = bpf_map_lookup_and_delete_batch(map_fd,
                                              &in_batch, &out_batch,
                                              keys, values, &count,
                                              NULL);

        /*
         * Despite what the uapi header says, lookup_and_delete_batch will
         * return -ENOENT when it has successfully deleted all elements, so
         * check for this case separately
         */
        CHECK(ret < 0 && (errno != ENOENT || !count), "bpf_map_lookup_and_delete_batch",
                       "error: %s\n", strerror(errno));

        CHECK(count != save_count,
                        "bpf_map_lookup_and_delete_batch",
                        "deleted not all elements: removed=%u expected=%u\n",
                        count, save_count);
}

static void delete_all_elements(__u32 type, int map_fd, bool batch)
{
        static __u8 val[8 << 10]; /* enough for 1024 CPUs */
        __u32 key = -1;
        void *keys;
        __u32 i, n;
        int ret;

        keys = calloc(MAX_MAP_KEY_SIZE, MAX_ENTRIES);
        CHECK(!keys, "calloc", "error: %s\n", strerror(errno));

        for (n = 0; !bpf_map_get_next_key(map_fd, &key, &key); n++)
                memcpy(keys + n*MAX_MAP_KEY_SIZE, &key, MAX_MAP_KEY_SIZE);

        if (batch) {
                /* Can't mix delete_batch and delete_and_lookup_batch because
                 * they have different semantics with respect to the keys
                 * argument. However, delete_batch utilizes map_delete_elem,
                 * so we effectively test it in the non-batch scenario */
                delete_and_lookup_batch(map_fd, keys, n);
        } else {
                /* Intentionally mix delete and lookup_and_delete so we can test both */
                for (i = 0; i < n; i++) {
                        void *keyp = keys + i*MAX_MAP_KEY_SIZE;

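                        /* bpf_map_lookup_and_delete_elem() is not supported
                         * for map-in-map, so always use plain delete there */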
                        if (i % 2 || type == BPF_MAP_TYPE_HASH_OF_MAPS) {
                                ret = bpf_map_delete_elem(map_fd, keyp);
                                CHECK(ret < 0, "bpf_map_delete_elem",
                                               "error: key %u: %s\n", i, strerror(errno));
                        } else {
                                ret = bpf_map_lookup_and_delete_elem(map_fd, keyp, val);
                                CHECK(ret < 0, "bpf_map_lookup_and_delete_elem",
                                               "error: key %u: %s\n", i, strerror(errno));
                        }
                }
        }

        free(keys);
}

static bool is_lru(__u32 map_type)
{
        return map_type == BPF_MAP_TYPE_LRU_HASH ||
               map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool is_percpu(__u32 map_type)
{
        return map_type == BPF_MAP_TYPE_PERCPU_HASH ||
               map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

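/* Options shared by all updater threads: each thread upserts keys [0, n)
 * into map_fd; retry_for_nomem makes transient -ENOMEM from non-preallocated
 * per-cpu maps non-fatal (see __test()). */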
struct upsert_opts {
        __u32 map_type;
        int map_fd;
        __u32 n;
        bool retry_for_nomem;
};

static int create_small_hash(void)
{
        int map_fd;

        map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "small", 4, 4, 4, NULL);
        CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n",
                        strerror(errno), "small");

        return map_fd;
}

static bool retry_for_nomem_fn(int err)
{
        return err == ENOMEM;
}

static void *patch_map_thread(void *arg)
{
        /* 8KB is enough for per-cpu values on up to 1024 CPUs. The buffer is
         * never written after initialization, so sharing it between the
         * N_THREADS updater threads is safe. */
        static __u8 blob[8 << 10];
        struct upsert_opts *opts = arg;
        void *val_ptr;
        int val;
        int ret;
        int i;

        for (i = 0; i < opts->n; i++) {
                if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
                        val = create_small_hash();
                        val_ptr = &val;
                } else if (is_percpu(opts->map_type)) {
                        val_ptr = blob;
                } else {
                        val = rand();
                        val_ptr = &val;
                }

                /* Retry for up to 40 attempts; with the helper's backoff this
                 * is a budget of roughly 2 seconds, which should be enough
                 * for the allocator to recover */
                if (opts->retry_for_nomem)
                        ret = map_update_retriable(opts->map_fd, &i, val_ptr, 0,
                                                   40, retry_for_nomem_fn);
                else
                        ret = bpf_map_update_elem(opts->map_fd, &i, val_ptr, 0);
                CHECK(ret < 0, "bpf_map_update_elem", "key=%d error: %s\n", i, strerror(errno));

                if (opts->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
                        close(val);
        }
        return NULL;
}

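/* Run N_THREADS threads that all upsert the same keys [0, opts->n), so once
 * they have joined the map holds exactly opts->n elements (modulo LRU
 * eviction). */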
static void upsert_elements(struct upsert_opts *opts)
{
        pthread_t threads[N_THREADS];
        int ret;
        int i;

        for (i = 0; i < ARRAY_SIZE(threads); i++) {
                ret = pthread_create(&threads[i], NULL, patch_map_thread, opts);
                CHECK(ret != 0, "pthread_create", "error: %s\n", strerror(ret));
        }

        for (i = 0; i < ARRAY_SIZE(threads); i++) {
                ret = pthread_join(threads[i], NULL);
                CHECK(ret != 0, "pthread_join", "error: %s\n", strerror(ret));
        }
}

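/* The iterator program prints the element count as a decimal string; read it
 * from the iterator fd and parse it back into a __u32. */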
static __u32 read_cur_elements(int iter_fd)
{
        char buf[64];
        ssize_t n;
        __u32 ret;

        n = read(iter_fd, buf, sizeof(buf)-1);
        CHECK(n <= 0, "read", "error: %s\n", strerror(errno));
        buf[n] = '\0';

        errno = 0;
        ret = (__u32)strtol(buf, NULL, 10);
        CHECK(errno != 0, "strtol", "error: %s\n", strerror(errno));

        return ret;
}

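/* Attach the map_percpu_stats iterator program to the map with the given id
 * and read back the element count it reports. */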
static __u32 get_cur_elements(int map_id)
{
        struct map_percpu_stats *skel;
        struct bpf_link *link;
        __u32 n_elements;
        int iter_fd;
        int ret;

        skel = map_percpu_stats__open();
        CHECK(skel == NULL, "map_percpu_stats__open", "error: %s\n", strerror(errno));

        skel->bss->target_id = map_id;

        ret = map_percpu_stats__load(skel);
        CHECK(ret != 0, "map_percpu_stats__load", "error: %s\n", strerror(errno));

        link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
        CHECK(!link, "bpf_program__attach_iter", "error: %s\n", strerror(errno));

        iter_fd = bpf_iter_create(bpf_link__fd(link));
        CHECK(iter_fd < 0, "bpf_iter_create", "error: %s\n", strerror(errno));

        n_elements = read_cur_elements(iter_fd);

        close(iter_fd);
        bpf_link__destroy(link);
        map_percpu_stats__destroy(skel);

        return n_elements;
}

static void check_expected_number_elements(__u32 n_inserted, int map_fd,
                                           struct bpf_map_info *info)
{
        __u32 n_real;
        __u32 n_iter;

        /* Count the current number of elements in the map by iterating through
         * all the map keys via bpf_map_get_next_key()
         */
        n_real = map_count_elements(info->type, map_fd);

        /* The "real" number of elements should be the same as the inserted
         * number of elements in all cases except LRU maps, where some elements
         * may have been evicted
         */
        if (n_inserted == 0 || !is_lru(info->type))
                CHECK(n_inserted != n_real, "map_count_elements",
                      "n_real(%u) != n_inserted(%u)\n", n_real, n_inserted);

        /* Count the current number of elements in the map using an iterator */
        n_iter = get_cur_elements(info->id);

        /* Both counts should be the same, as all updates are over */
        CHECK(n_iter != n_real, "get_cur_elements",
              "n_iter=%u, expected %u (map_type=%s,map_flags=%08x)\n",
              n_iter, n_real, map_type_to_s(info->type), info->map_flags);
}

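/* Fill the map concurrently, verify the reported element count, then empty it
 * (first element-by-element, then with a batched delete) and verify again. */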
static void __test(int map_fd)
{
        struct upsert_opts opts = {
                .map_fd = map_fd,
        };
        struct bpf_map_info info;

        map_info(map_fd, &info);
        opts.map_type = info.type;
        opts.n = info.max_entries;

        /* Reduce the number of elements we are updating so that we don't
         * bump into -E2BIG from non-preallocated hash maps, but still
         * trigger some evictions for LRU maps */
        if (opts.map_type != BPF_MAP_TYPE_HASH_OF_MAPS)
                opts.n -= 512;
        else
                opts.n /= 2;

        /* The per-cpu bpf memory allocator may fail to allocate a per-cpu
         * pointer and may not refill its free llist in time, in which case
         * bpf_map_update_elem() returns -ENOMEM. Retry the update to
         * mitigate the problem temporarily.
         */
        opts.retry_for_nomem = is_percpu(opts.map_type) && (info.map_flags & BPF_F_NO_PREALLOC);

        /*
         * Upsert keys [0, n) under some contention: N_THREADS threads all
         * update the same keys with random values. Check the element count,
         * then delete all elements and check the count again.
         */
        upsert_elements(&opts);
        check_expected_number_elements(opts.n, map_fd, &info);
        delete_all_elements(info.type, map_fd, !BATCH);
        check_expected_number_elements(0, map_fd, &info);

        /* Now do the same, but using batch delete operations */
        upsert_elements(&opts);
        check_expected_number_elements(opts.n, map_fd, &info);
        delete_all_elements(info.type, map_fd, BATCH);
        check_expected_number_elements(0, map_fd, &info);

        close(map_fd);
}

static int map_create_opts(__u32 type, const char *name,
                           struct bpf_map_create_opts *map_opts,
                           __u32 key_size, __u32 val_size)
{
        int max_entries;
        int map_fd;

        if (type == BPF_MAP_TYPE_HASH_OF_MAPS)
                max_entries = MAX_ENTRIES_HASH_OF_MAPS;
        else
                max_entries = MAX_ENTRIES;

        map_fd = bpf_map_create(type, name, key_size, val_size, max_entries, map_opts);
        CHECK(map_fd < 0, "bpf_map_create()", "error:%s (name=%s)\n",
                        strerror(errno), name);

        return map_fd;
}

static int map_create(__u32 type, const char *name, struct bpf_map_create_opts *map_opts)
{
        return map_create_opts(type, name, map_opts, sizeof(int), sizeof(int));
}

static int create_hash(void)
{
        LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC);

        return map_create(BPF_MAP_TYPE_HASH, "hash", &map_opts);
}

static int create_percpu_hash(void)
{
        LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = BPF_F_NO_PREALLOC);

        return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash", &map_opts);
}

static int create_hash_prealloc(void)
{
        return map_create(BPF_MAP_TYPE_HASH, "hash", NULL);
}

static int create_percpu_hash_prealloc(void)
{
        return map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_hash_prealloc", NULL);
}

static int create_lru_hash(__u32 type, __u32 map_flags)
{
        LIBBPF_OPTS(bpf_map_create_opts, map_opts, .map_flags = map_flags);

        return map_create(type, "lru_hash", &map_opts);
}

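/* The inner map only serves as a template for the outer map's value type, so
 * its fd can be closed as soon as the outer map has been created. */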
static int create_hash_of_maps(void)
{
        LIBBPF_OPTS(bpf_map_create_opts, map_opts,
                .map_flags = BPF_F_NO_PREALLOC,
                .inner_map_fd = create_small_hash(),
        );
        int ret;

        ret = map_create_opts(BPF_MAP_TYPE_HASH_OF_MAPS, "hash_of_maps",
                              &map_opts, sizeof(int), sizeof(int));
        close(map_opts.inner_map_fd);
        return ret;
}

static void map_percpu_stats_hash(void)
{
        __test(create_hash());
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_percpu_hash(void)
{
        __test(create_percpu_hash());
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_hash_prealloc(void)
{
        __test(create_hash_prealloc());
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_percpu_hash_prealloc(void)
{
        __test(create_percpu_hash_prealloc());
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_lru_hash(void)
{
        __test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, 0));
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_lru_hash_no_common(void)
{
        __test(create_lru_hash(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU));
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_percpu_lru_hash(void)
{
        __test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, 0));
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_percpu_lru_hash_no_common(void)
{
        __test(create_lru_hash(BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_F_NO_COMMON_LRU));
        printf("test_%s:PASS\n", __func__);
}

static void map_percpu_stats_hash_of_maps(void)
{
        __test(create_hash_of_maps());
        printf("test_%s:PASS\n", __func__);
}

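/* A per-cpu map value must fit into one per-cpu allocation unit, so creating
 * a map with value_size > PCPU_MIN_UNIT_SIZE is expected to fail with E2BIG. */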
static void map_percpu_stats_map_value_size(void)
{
        int fd;
        int value_sz = PCPU_MIN_UNIT_SIZE + 1;
        struct bpf_map_create_opts opts = { .sz = sizeof(opts) };
        enum bpf_map_type map_types[] = { BPF_MAP_TYPE_PERCPU_ARRAY,
                                          BPF_MAP_TYPE_PERCPU_HASH,
                                          BPF_MAP_TYPE_LRU_PERCPU_HASH };
        for (int i = 0; i < ARRAY_SIZE(map_types); i++) {
                fd = bpf_map_create(map_types[i], NULL, sizeof(__u32), value_sz, 1, &opts);
                CHECK(fd < 0 && errno != E2BIG, "percpu map value size",
                        "error: %s\n", strerror(errno));
        }
        printf("test_%s:PASS\n", __func__);
}

void test_map_percpu_stats(void)
{
        map_percpu_stats_hash();
        map_percpu_stats_percpu_hash();
        map_percpu_stats_hash_prealloc();
        map_percpu_stats_percpu_hash_prealloc();
        map_percpu_stats_lru_hash();
        map_percpu_stats_lru_hash_no_common();
        map_percpu_stats_percpu_lru_hash();
        map_percpu_stats_percpu_lru_hash_no_common();
        map_percpu_stats_hash_of_maps();
        map_percpu_stats_map_value_size();
}