/* root/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Benchmark module for page_pool.
 *
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/interrupt.h>
#include <linux/limits.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <net/page_pool/helpers.h>

#include "time_bench.h"

/* Set to 0 to silence the module load/unload pr_info messages */
static int verbose = 1;
/* Elements pre-loaded into the page_pool's ptr_ring (pool_size param) */
#define MY_POOL_SIZE 1024

/* Makes tests selectable. Useful for perf-record to analyze a single test.
 * Hint: Bash shells support writing binary number like: $((2#101010)
 *
 * # modprobe bench_page_pool_simple run_flags=$((2#100))
 */
static unsigned long run_flags = 0xFFFFFFFF;
module_param(run_flags, ulong, 0);
MODULE_PARM_DESC(run_flags, "Limit which bench test that runs");

/* Count the bit number from the enum */
enum benchmark_bit {
        bit_run_bench_baseline,
        bit_run_bench_no_softirq01,
        bit_run_bench_no_softirq02,
        bit_run_bench_no_softirq03,
};

/* Map an enum benchmark_bit to its mask and test it against run_flags */
#define bit(b)          (1 << (b))
#define enabled(b)      ((run_flags & (bit(b))))

/* notice time_bench is limited to U32_MAX nr loops */
static unsigned long loops = 10000000;
module_param(loops, ulong, 0);
MODULE_PARM_DESC(loops, "Specify loops bench will run");

/* Timing at the nanosec level, we need to know the overhead
 * introduced by the for loop itself
 */
/* Baseline: cost of an empty counting loop.  This overhead can be
 * subtracted from the real benchmarks when timing at the nanosec level.
 */
static int time_bench_for_loop(struct time_bench_record *rec, void *data)
{
        uint64_t iterations = 0;
        int it;

        time_bench_start(rec);
        /** Loop to measure **/
        for (it = 0; it < rec->loops; it++) {
                iterations++;
                barrier(); /* stop the compiler collapsing the loop */
        }
        time_bench_stop(rec, iterations);
        return iterations;
}

/* Baseline: cost of a single atomic increment per iteration */
static int time_bench_atomic_inc(struct time_bench_record *rec, void *data)
{
        atomic_t counter;
        uint64_t iterations;
        int it;

        atomic_set(&counter, 0);

        time_bench_start(rec);
        /** Loop to measure **/
        for (it = 0; it < rec->loops; it++) {
                atomic_inc(&counter);
                barrier(); /* stop the compiler collapsing the loop */
        }
        iterations = atomic_read(&counter);
        time_bench_stop(rec, iterations);
        return iterations;
}

/* The ptr_ping in page_pool uses a spinlock. We need to know the minimum
 * overhead of taking+releasing a spinlock, to know the cycles that can be saved
 * by e.g. amortizing this via bulking.
 */
/* The ptr_ping in page_pool uses a spinlock. We need to know the minimum
 * overhead of taking+releasing a spinlock, to know the cycles that can be saved
 * by e.g. amortizing this via bulking.
 */
static int time_bench_lock(struct time_bench_record *rec, void *data)
{
        uint64_t iterations = 0;
        spinlock_t bench_lock;
        int it;

        spin_lock_init(&bench_lock);

        time_bench_start(rec);
        /** Loop to measure **/
        for (it = 0; it < rec->loops; it++) {
                spin_lock(&bench_lock);
                iterations++;
                barrier(); /* stop the compiler collapsing the loop */
                spin_unlock(&bench_lock);
        }
        time_bench_stop(rec, iterations);
        return iterations;
}

/* Helper for filling some page's into ptr_ring */
static void pp_fill_ptr_ring(struct page_pool *pp, int elems)
{
        /* GFP_ATOMIC needed when under run softirq */
        gfp_t gfp_mask = GFP_ATOMIC;
        struct page **array;
        int i;

        array = kcalloc(elems, sizeof(struct page *), gfp_mask);

        for (i = 0; i < elems; i++)
                array[i] = page_pool_alloc_pages(pp, gfp_mask);
        for (i = 0; i < elems; i++)
                page_pool_put_page(pp, array[i], -1, false);

        kfree(array);
}

/* Selects which page_pool return path the shared benchmark exercises */
enum test_type { type_fast_path, type_ptr_ring, type_page_allocator };

/* Core page_pool benchmark: alloc a page each iteration and return it
 * via the path selected by @type.  Depends on the compiler inlining and
 * constant-folding the @type branches so only one return path remains.
 *
 * Returns the number of completed iterations (0 on pool-creation error).
 */
static int time_bench_page_pool(struct time_bench_record *rec, void *data,
                                enum test_type type, const char *func)
{
        uint64_t loops_cnt = 0;
        gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */
        int i, err;

        struct page_pool *pp;
        struct page *page;

        struct page_pool_params pp_params = {
                .order = 0,
                .flags = 0,
                .pool_size = MY_POOL_SIZE,
                .nid = NUMA_NO_NODE,
                .dev = NULL, /* Only use for DMA mapping */
                .dma_dir = DMA_BIDIRECTIONAL,
        };

        pp = page_pool_create(&pp_params);
        if (IS_ERR(pp)) {
                err = PTR_ERR(pp);
                pr_warn("%s: Error(%d) creating page_pool\n", func, err);
                /* Bugfix: must NOT reach page_pool_destroy() with an
                 * ERR_PTR value -- it only tolerates NULL, so the old
                 * "goto out" path dereferenced a poisoned pointer.
                 */
                return 0;
        }
        pp_fill_ptr_ring(pp, 64);

        if (in_serving_softirq())
                pr_warn("%s(): in_serving_softirq fast-path\n", func);
        else
                pr_warn("%s(): Cannot use page_pool fast-path\n", func);

        time_bench_start(rec);
        /** Loop to measure **/
        for (i = 0; i < rec->loops; i++) {
                /* Common fast-path alloc that depend on in_serving_softirq() */
                page = page_pool_alloc_pages(pp, gfp_mask);
                if (!page)
                        break;
                loops_cnt++;
                barrier(); /* avoid compiler to optimize this loop */

                /* The benchmarks purpose it to test different return paths.
                 * Compiler should inline optimize other function calls out
                 */
                if (type == type_fast_path) {
                        /* Fast-path recycling e.g. XDP_DROP use-case */
                        page_pool_recycle_direct(pp, page);

                } else if (type == type_ptr_ring) {
                        /* Normal return path */
                        page_pool_put_page(pp, page, -1, false);

                } else if (type == type_page_allocator) {
                        /* Test if not pages are recycled, but instead
                         * returned back into systems page allocator
                         */
                        get_page(page); /* cause no-recycling */
                        page_pool_put_page(pp, page, -1, false);
                        put_page(page);
                } else {
                        BUILD_BUG();
                }
        }
        time_bench_stop(rec, loops_cnt);
        page_pool_destroy(pp);
        return loops_cnt;
}

/* Bench the direct-recycle fast path (e.g. XDP_DROP use-case) */
static int time_bench_page_pool01_fast_path(struct time_bench_record *rec,
                                            void *data)
{
        return time_bench_page_pool(rec, data, type_fast_path, __func__);
}

/* Bench the normal return path via the pool's ptr_ring */
static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec,
                                           void *data)
{
        return time_bench_page_pool(rec, data, type_ptr_ring, __func__);
}

/* Bench the slow path: pages go back to the system page allocator */
static int time_bench_page_pool03_slow(struct time_bench_record *rec,
                                       void *data)
{
        return time_bench_page_pool(rec, data, type_page_allocator, __func__);
}

static int run_benchmark_tests(void)
{
        uint32_t nr_loops = loops;

        /* Baseline tests */
        if (enabled(bit_run_bench_baseline)) {
                time_bench_loop(nr_loops * 10, 0, "for_loop", NULL,
                                time_bench_for_loop);
                time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL,
                                time_bench_atomic_inc);
                time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock);
        }

        /* This test cannot activate correct code path, due to no-softirq ctx */
        if (enabled(bit_run_bench_no_softirq01))
                time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL,
                                time_bench_page_pool01_fast_path);
        if (enabled(bit_run_bench_no_softirq02))
                time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL,
                                time_bench_page_pool02_ptr_ring);
        if (enabled(bit_run_bench_no_softirq03))
                time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL,
                                time_bench_page_pool03_slow);

        return 0;
}

/* Module entry: validate the "loops" param and run all enabled benchmarks.
 * Benchmarks execute synchronously during module load.
 */
static int __init bench_page_pool_simple_module_init(void)
{
        if (verbose)
                pr_info("Loaded\n");

        /* time_bench stores loop counts in u32, so reject larger values */
        if (loops > U32_MAX) {
                pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n", loops,
                       U32_MAX);
                return -ECHRNG;
        }

        run_benchmark_tests();

        return 0;
}
module_init(bench_page_pool_simple_module_init);

/* Module exit: nothing to tear down; benchmarks ran at load time */
static void __exit bench_page_pool_simple_module_exit(void)
{
        if (verbose)
                pr_info("Unloaded\n");
}
module_exit(bench_page_pool_simple_module_exit);

/* Module metadata */
MODULE_DESCRIPTION("Benchmark of page_pool simple cases");
MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@brouer.com>");
MODULE_LICENSE("GPL");