/* tools/testing/selftests/mm/khugepaged.c */
#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <dirent.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>

#include "linux/magic.h"

#include "vm_util.h"
#include "thp_settings.h"

/* Fixed base address where all test VMAs are mapped (the 1 GiB mark). */
#define BASE_ADDR ((void *)(1UL << 30))
static unsigned long hpage_pmd_size;	/* bytes covered by one PMD hugepage */
static unsigned long page_size;		/* base page size in bytes */
static int hpage_pmd_nr;		/* base pages per PMD hugepage */
static int anon_order;			/* anon mTHP order under test */

#define PID_SMAPS "/proc/self/smaps"	/* smaps of this process, for check_swap() */
#define TEST_FILE "collapse_test_file"	/* scratch file name under [dir] */

#define MAX_LINE_LENGTH 500		/* max smaps line / pattern length */

/* Kind of memory backing a test VMA. */
enum vma_type {
        VMA_ANON,	/* private anonymous memory */
        VMA_FILE,	/* read-only file-backed memory */
        VMA_SHMEM,	/* shmem/tmpfs-backed memory */
};

/* Per-backing-type operations used by the generic collapse tests. */
struct mem_ops {
        void *(*setup_area)(int nr_hpages);	/* map nr_hpages PMD-sized area at BASE_ADDR */
        void (*cleanup_area)(void *p, unsigned long size);	/* unmap + release backing */
        void (*fault)(void *p, unsigned long start, unsigned long end);	/* populate [start, end) */
        bool (*check_huge)(void *addr, int nr_hpages);	/* true iff exactly nr_hpages PMD-mapped */
        const char *name;
};

/* Set in main() to the enabled subset of &__file_ops/&__anon_ops/&__shmem_ops. */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;

/* How a collapse is triggered: khugepaged scan vs. MADV_COLLAPSE. */
struct collapse_context {
        void (*collapse)(const char *msg, char *p, int nr_hpages,
                         struct mem_ops *ops, bool expect);
        bool enforce_pte_scan_limits;	/* context honors max_ptes_* knobs */
        const char *name;
};

/* Set in main() to the enabled subset of the two collapse contexts. */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;

/* State describing the file/shmem backing used by file-based tests. */
struct file_info {
        const char *dir;	/* user-supplied [dir] argument */
        char path[PATH_MAX];	/* dir + "/" + TEST_FILE */
        enum vma_type type;	/* VMA_SHMEM if dir is on tmpfs, else VMA_FILE */
        int fd;			/* fd of the currently mapped test file/memfd */
        char dev_queue_read_ahead_path[PATH_MAX];	/* owning device's read_ahead_kb knob */
};

static struct file_info finfo;
/* Forked children set this so only the parent restores sysfs settings. */
static bool skip_settings_restore;
/* Accumulated failure count; becomes the process exit code. */
static int exit_status;

/* Print @msg in green to mark a passing check. */
static void success(const char *msg)
{
        printf(" \e[32m%s\e[0m\n", msg);
}

/* Print @msg in red and bump the process-wide failure count. */
static void fail(const char *msg)
{
        printf(" \e[31m%s\e[0m\n", msg);
        exit_status++;
}

/* Print @msg in yellow to mark a skipped check (does not affect exit status). */
static void skip(const char *msg)
{
        printf(" \e[33m%s\e[0m\n", msg);
}

/*
 * atexit() handler: put the saved THP/khugepaged sysfs settings back.
 * Runs at most once; forked children set skip_settings_restore so that
 * only the parent touches the system-wide knobs.
 */
static void restore_settings_atexit(void)
{
        if (!skip_settings_restore) {
                printf("Restore THP and khugepaged settings...");
                thp_restore_settings();
                success("OK");

                skip_settings_restore = true;
        }
}

/* Signal handler: exit() so the atexit handler restores sysfs settings. */
static void restore_settings(int sig)
{
        int code = sig ? EXIT_FAILURE : exit_status;

        /* exit() will invoke the restore_settings_atexit handler. */
        exit(code);
}

/*
 * Snapshot the current THP/khugepaged sysfs settings and arrange for them
 * to be restored on normal exit and on fatal signals.
 */
static void save_settings(void)
{
        static const int fatal_sigs[] = { SIGTERM, SIGINT, SIGHUP, SIGQUIT };
        size_t i;

        printf("Save THP and khugepaged settings...");
        /* The file backend also saves the device's read_ahead_kb knob. */
        if (file_ops && finfo.type == VMA_FILE)
                thp_set_read_ahead_path(finfo.dev_queue_read_ahead_path);
        thp_save_settings();

        success("OK");

        atexit(restore_settings_atexit);
        for (i = 0; i < sizeof(fatal_sigs) / sizeof(fatal_sigs[0]); i++)
                signal(fatal_sigs[i], restore_settings);
}

/*
 * Populate finfo from the user-supplied directory @dir: build the test file
 * path, classify the backing (tmpfs vs. regular file), and for regular
 * files locate the owning block device's queue/read_ahead_kb control.
 * Exits on any error.
 */
static void get_finfo(const char *dir)
{
        struct stat path_stat;
        struct statfs fs;
        char buf[1 << 10];
        char path[PATH_MAX];
        char *str, *end;

        finfo.dir = dir;
        /* Fix: stat() failure was previously ignored, leaving path_stat garbage. */
        if (stat(finfo.dir, &path_stat)) {
                perror("stat()");
                exit(EXIT_FAILURE);
        }
        if (!S_ISDIR(path_stat.st_mode)) {
                printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
                exit(EXIT_FAILURE);
        }
        if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
                     finfo.dir) >= sizeof(finfo.path)) {
                printf("%s: Pathname is too long\n", __func__);
                exit(EXIT_FAILURE);
        }
        if (statfs(finfo.dir, &fs)) {
                perror("statfs()");
                exit(EXIT_FAILURE);
        }
        finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
        if (finfo.type == VMA_SHMEM)
                return;	/* tmpfs has no backing block device */

        /* Find owning device's queue/read_ahead_kb control */
        if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
                     major(path_stat.st_dev), minor(path_stat.st_dev))
            >= sizeof(path)) {
                printf("%s: Pathname is too long\n", __func__);
                exit(EXIT_FAILURE);
        }
        if (read_file(path, buf, sizeof(buf)) < 0) {
                perror("read_file(read_num)");
                exit(EXIT_FAILURE);
        }
        if (strstr(buf, "DEVTYPE=disk")) {
                /* Found it */
                if (snprintf(finfo.dev_queue_read_ahead_path,
                             sizeof(finfo.dev_queue_read_ahead_path),
                             "/sys/dev/block/%d:%d/queue/read_ahead_kb",
                             major(path_stat.st_dev), minor(path_stat.st_dev))
                    >= sizeof(finfo.dev_queue_read_ahead_path)) {
                        printf("%s: Pathname is too long\n", __func__);
                        exit(EXIT_FAILURE);
                }
                return;
        }
        if (!strstr(buf, "DEVTYPE=partition")) {
                printf("%s: Unknown device type: %s\n", __func__, path);
                exit(EXIT_FAILURE);
        }
        /*
         * Partition of block device - need to find actual device.
         * Using naming convention that devnameN is partition of
         * device devname.
         */
        str = strstr(buf, "DEVNAME=");
        if (!str) {
                /* Fix: message previously lacked the trailing newline. */
                printf("%s: Could not read: %s\n", __func__, path);
                exit(EXIT_FAILURE);
        }
        str += 8;	/* skip "DEVNAME=" */
        end = str;
        while (*end) {
                /* Cast avoids UB for chars outside unsigned char range. */
                if (isdigit((unsigned char)*end)) {
                        *end = '\0';	/* truncate at first digit: devnameN -> devname */
                        if (snprintf(finfo.dev_queue_read_ahead_path,
                                     sizeof(finfo.dev_queue_read_ahead_path),
                                     "/sys/block/%s/queue/read_ahead_kb",
                                     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
                                printf("%s: Pathname is too long\n", __func__);
                                exit(EXIT_FAILURE);
                        }
                        return;
                }
                ++end;
        }
        printf("%s: Could not read: %s\n", __func__, path);
        exit(EXIT_FAILURE);
}

/*
 * Return true iff /proc/self/smaps reports exactly @size bytes swapped out
 * for the VMA that starts at @addr.
 */
static bool check_swap(void *addr, unsigned long size)
{
        bool swap = false;
        int ret;
        FILE *fp;
        char buffer[MAX_LINE_LENGTH];
        char addr_pattern[MAX_LINE_LENGTH];

        /* Locate the smaps entry whose start address matches @addr. */
        ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
                       (unsigned long) addr);
        if (ret >= MAX_LINE_LENGTH) {
                printf("%s: Pattern is too long\n", __func__);
                exit(EXIT_FAILURE);
        }


        fp = fopen(PID_SMAPS, "r");
        if (!fp) {
                printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
                exit(EXIT_FAILURE);
        }
        if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
                goto err_out;

        /* Expected "Swap:" line; field width 19 matches smaps formatting. */
        ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
                       size >> 10);
        if (ret >= MAX_LINE_LENGTH) {
                printf("%s: Pattern is too long\n", __func__);
                exit(EXIT_FAILURE);
        }
        /*
         * Fetch the Swap: line in the same smaps block and check whether it
         * reports the expected number of swapped-out kilobytes.
         */
        if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
                goto err_out;

        if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
                goto err_out;

        swap = true;
err_out:
        fclose(fp);
        return swap;
}

/*
 * Map @nr PMD-sized, private anonymous hugepage slots at BASE_ADDR.
 * The p != BASE_ADDR check also catches MAP_FAILED; exits on failure.
 */
static void *alloc_mapping(int nr)
{
        const unsigned long len = nr * hpage_pmd_size;
        void *p = mmap(BASE_ADDR, len, PROT_READ | PROT_WRITE,
                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (p != BASE_ADDR) {
                printf("Failed to allocate VMA at %p\n", BASE_ADDR);
                exit(EXIT_FAILURE);
        }

        return p;
}

/*
 * Write a distinct marker (page index + 0xdead0000) into the first int of
 * every base page in [start, end); validate_memory() checks the same pattern.
 */
static void fill_memory(int *p, unsigned long start, unsigned long end)
{
        int pg;

        for (pg = start / page_size; pg < end / page_size; pg++)
                p[pg * page_size / sizeof(*p)] = pg + 0xdead0000;
}

/*
 * Issue MADV_COLLAPSE, retrying once on EAGAIN: the request is best-effort
 * and may transiently fail when an internal resource is unavailable.
 */
static int madvise_collapse_retry(void *p, unsigned long size)
{
        int attempts = 2;
        int ret;

        do {
                ret = madvise(p, size, MADV_COLLAPSE);
        } while (ret && errno == EAGAIN && --attempts);

        return ret;
}

/*
 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
 * validate_memory()'able contents.  Exits on any failure.
 */
static void *alloc_hpage(struct mem_ops *ops)
{
        void *p = ops->setup_area(1);

        ops->fault(p, 0, hpage_pmd_size);

        /*
         * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
         * The latter is ineligible for collapse by MADV_COLLAPSE
         * while the former might cause MADV_COLLAPSE to race with
         * khugepaged on low-load system (like a test machine), which
         * would cause MADV_COLLAPSE to fail with EAGAIN.
         */
        printf("Allocate huge page...");
        if (madvise_collapse_retry(p, hpage_pmd_size)) {
                perror("madvise(MADV_COLLAPSE)");
                exit(EXIT_FAILURE);
        }
        if (!ops->check_huge(p, 1)) {
                /*
                 * Fix: this previously called perror("madvise(MADV_COLLAPSE)")
                 * even though no syscall failed here, printing a stale errno.
                 */
                printf("madvise(MADV_COLLAPSE) did not leave a PMD-mapped hugepage\n");
                exit(EXIT_FAILURE);
        }
        if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
                perror("madvise(MADV_HUGEPAGE)");
                exit(EXIT_FAILURE);
        }
        success("OK");
        return p;
}

/*
 * Verify the per-page marker written by fill_memory() is intact in every
 * base page of [start, end); exits on the first corrupted page.
 */
static void validate_memory(int *p, unsigned long start, unsigned long end)
{
        int pg;

        for (pg = start / page_size; pg < end / page_size; pg++) {
                if (p[pg * page_size / sizeof(*p)] == pg + 0xdead0000)
                        continue;
                printf("Page %d is corrupted: %#x\n",
                                pg, p[pg * page_size / sizeof(*p)]);
                exit(EXIT_FAILURE);
        }
}

/* Anon backend: map nr_hpages PMD-sized anonymous slots at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
        return alloc_mapping(nr_hpages);
}

/* Anon backend: unmapping frees the private anonymous memory. */
static void anon_cleanup_area(void *p, unsigned long size)
{
        munmap(p, size);
}

/* Anon backend: populate by writing the validate_memory() pattern. */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
        fill_memory(p, start, end);
}

/* Anon backend: true iff exactly nr_hpages anon hugepages map @addr. */
static bool anon_check_huge(void *addr, int nr_hpages)
{
        return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
}

/*
 * File backend: create the test file filled with the validate_memory()
 * pattern, then map it read-only at BASE_ADDR and drop the page cache so
 * collapse has to read it back in.  Exits on any failure.
 */
static void *file_setup_area(int nr_hpages)
{
        int fd;
        void *p;
        unsigned long size;

        unlink(finfo.path);  /* Cleanup from previous failed tests */
        printf("Creating %s for collapse%s...", finfo.path,
               finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
        /* Fix: mode was decimal 777 (odd permission bits), not octal. */
        fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
                  0777);
        if (fd < 0) {
                perror("open()");
                exit(EXIT_FAILURE);
        }

        size = nr_hpages * hpage_pmd_size;
        p = alloc_mapping(nr_hpages);
        fill_memory(p, 0, size);
        /* Fix: short/failed writes were previously silently ignored. */
        if (write(fd, p, size) != (ssize_t)size) {
                perror("write()");
                exit(EXIT_FAILURE);
        }
        close(fd);
        munmap(p, size);
        success("OK");

        printf("Opening %s read only for collapse...", finfo.path);
        finfo.fd = open(finfo.path, O_RDONLY);
        if (finfo.fd < 0) {
                perror("open()");
                exit(EXIT_FAILURE);
        }
        p = mmap(BASE_ADDR, size, PROT_READ,
                 MAP_PRIVATE, finfo.fd, 0);
        if (p == MAP_FAILED || p != BASE_ADDR) {
                perror("mmap()");
                exit(EXIT_FAILURE);
        }

        /* Drop page cache */
        write_file("/proc/sys/vm/drop_caches", "3", 2);
        success("OK");
        return p;
}

/* File backend: unmap, close and remove the test file. */
static void file_cleanup_area(void *p, unsigned long size)
{
        munmap(p, size);
        close(finfo.fd);
        unlink(finfo.path);
}

/*
 * File backend: the mapping is read-only, so populate via
 * MADV_POPULATE_READ rather than by writing.  Exits on failure.
 */
static void file_fault(void *p, unsigned long start, unsigned long end)
{
        if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
                /* Fix: message previously missed the closing parenthesis. */
                perror("madvise(MADV_POPULATE_READ)");
                exit(EXIT_FAILURE);
        }
}

/*
 * File backend: dispatch the hugepage check on the actual backing type,
 * since [dir] may live on tmpfs.  Exits on an unknown type.
 */
static bool file_check_huge(void *addr, int nr_hpages)
{
        if (finfo.type == VMA_FILE)
                return check_huge_file(addr, nr_hpages, hpage_pmd_size);
        if (finfo.type == VMA_SHMEM)
                return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);

        exit(EXIT_FAILURE);
        return false;
}

/*
 * Shmem backend: create a memfd of nr_hpages hugepage-sizes and map it
 * shared at BASE_ADDR.  Exits on any failure.
 */
static void *shmem_setup_area(int nr_hpages)
{
        void *p;
        unsigned long size = nr_hpages * hpage_pmd_size;

        finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
        if (finfo.fd < 0)  {
                perror("memfd_create()");
                exit(EXIT_FAILURE);
        }
        if (ftruncate(finfo.fd, size)) {
                perror("ftruncate()");
                exit(EXIT_FAILURE);
        }
        p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
                 0);
        /* Also covers MAP_FAILED, which cannot equal BASE_ADDR. */
        if (p != BASE_ADDR) {
                perror("mmap()");
                exit(EXIT_FAILURE);
        }
        return p;
}

/* Shmem backend: unmap and close the memfd (freeing the shmem pages). */
static void shmem_cleanup_area(void *p, unsigned long size)
{
        munmap(p, size);
        close(finfo.fd);
}

/* Shmem backend: true iff exactly nr_hpages shmem hugepages map @addr. */
static bool shmem_check_huge(void *addr, int nr_hpages)
{
        return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
}

/* Ops table for private anonymous memory. */
static struct mem_ops __anon_ops = {
        .setup_area = &anon_setup_area,
        .cleanup_area = &anon_cleanup_area,
        .fault = &anon_fault,
        .check_huge = &anon_check_huge,
        .name = "anon",
};

/* Ops table for read-only file-backed memory (requires [dir]). */
static struct mem_ops __file_ops = {
        .setup_area = &file_setup_area,
        .cleanup_area = &file_cleanup_area,
        .fault = &file_fault,
        .check_huge = &file_check_huge,
        .name = "file",
};

/*
 * Ops table for memfd/shmem memory.  The mapping is writable and shared,
 * so anon_fault() (direct writes) populates it.
 */
static struct mem_ops __shmem_ops = {
        .setup_area = &shmem_setup_area,
        .cleanup_area = &shmem_cleanup_area,
        .fault = &anon_fault,
        .check_huge = &shmem_check_huge,
        .name = "shmem",
};

/*
 * Run MADV_COLLAPSE over @nr_hpages hugepage-sizes at @p and verify both
 * the return value and the resulting mapping against @expect.  THP is
 * temporarily disabled so khugepaged cannot interfere (and to prove that
 * MADV_COLLAPSE ignores the "enabled" knobs).
 */
static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
                               struct mem_ops *ops, bool expect)
{
        int ret;
        struct thp_settings settings = *thp_current_settings();

        printf("%s...", msg);

        /*
         * Prevent khugepaged interference and tests that MADV_COLLAPSE
         * ignores /sys/kernel/mm/transparent_hugepage/enabled
         */
        settings.thp_enabled = THP_NEVER;
        settings.shmem_enabled = SHMEM_NEVER;
        thp_push_settings(&settings);

        /* Clear VM_NOHUGEPAGE */
        madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
        ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
        /* ret != 0 means failure, so (bool)ret == expect is the wrong outcome. */
        if (((bool)ret) == expect)
                fail("Fail: Bad return value");
        else if (!ops->check_huge(p, expect ? nr_hpages : 0))
                fail("Fail: check_huge()");
        else
                success("OK");

        thp_pop_settings();
}

/*
 * collapse_context hook for MADV_COLLAPSE.  First asserts the range is not
 * already PMD-mapped; tests starting from a hugepage call
 * __madvise_collapse() directly instead.
 */
static void madvise_collapse(const char *msg, char *p, int nr_hpages,
                             struct mem_ops *ops, bool expect)
{
        /* Sanity check */
        if (!ops->check_huge(p, 0)) {
                printf("Unexpected huge page\n");
                exit(EXIT_FAILURE);
        }
        __madvise_collapse(msg, p, nr_hpages, ops, expect);
}

#define TICK 500000	/* poll interval in usecs (0.5s) */
/*
 * Mark the range VM_HUGEPAGE and wait for khugepaged to either collapse it
 * or complete two more full scans (so the range was definitely visited).
 * Returns true on timeout (~3s), false once collapse/scan completed.
 */
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
                          struct mem_ops *ops)
{
        int full_scans;
        int timeout = 6; /* 3 seconds */

        /* Sanity check */
        if (!ops->check_huge(p, 0)) {
                printf("Unexpected huge page\n");
                exit(EXIT_FAILURE);
        }

        madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);

        /* Wait until the second full_scan completed */
        full_scans = thp_read_num("khugepaged/full_scans") + 2;

        printf("%s...", msg);
        while (timeout--) {
                if (ops->check_huge(p, nr_hpages))
                        break;
                if (thp_read_num("khugepaged/full_scans") >= full_scans)
                        break;
                printf(".");
                usleep(TICK);
        }

        /* timeout == -1 iff the loop ran out without either break firing */
        return timeout == -1;
}

/*
 * collapse_context hook for khugepaged: wait for the daemon to scan the
 * range and verify the outcome matches @expect.  A timeout is a pass when
 * no collapse was expected.
 */
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
                                struct mem_ops *ops, bool expect)
{
        if (wait_for_scan(msg, p, nr_hpages, ops)) {
                if (expect)
                        fail("Timeout");
                else
                        success("OK");
                return;
        }

        /*
         * For file and shmem memory, khugepaged only retracts pte entries after
         * putting the new hugepage in the page cache. The hugepage must be
         * subsequently refaulted to install the pmd mapping for the mm.
         */
        if (ops != &__anon_ops)
                ops->fault(p, 0, nr_hpages * hpage_pmd_size);

        if (ops->check_huge(p, expect ? nr_hpages : 0))
                success("OK");
        else
                fail("Fail");
}

/* khugepaged context: collapses via the daemon, honors max_ptes_* limits. */
static struct collapse_context __khugepaged_context = {
        .collapse = &khugepaged_collapse,
        .enforce_pte_scan_limits = true,
        .name = "khugepaged",
};

/* MADV_COLLAPSE context: synchronous, ignores max_ptes_* limits. */
static struct collapse_context __madvise_context = {
        .collapse = &madvise_collapse,
        .enforce_pte_scan_limits = false,
        .name = "madvise",
};

/* True iff @ops is the file backend and [dir] actually lives on tmpfs. */
static bool is_tmpfs(struct mem_ops *ops)
{
        if (ops != &__file_ops)
                return false;
        return finfo.type == VMA_SHMEM;
}

/* True iff @ops is the anonymous-memory backend. */
static bool is_anon(struct mem_ops *ops)
{
        return ops == &__anon_ops;
}

/*
 * With THP "always", a first-touch write fault should install a hugepage
 * directly; a page-sized MADV_DONTNEED should then split the PMD.
 */
static void alloc_at_fault(void)
{
        struct thp_settings settings = *thp_current_settings();
        char *p;

        settings.thp_enabled = THP_ALWAYS;
        thp_push_settings(&settings);

        p = alloc_mapping(1);
        *p = 1;	/* first touch: fault in the whole PMD range */
        printf("Allocate huge page on fault...");
        if (check_huge_anon(p, 1, hpage_pmd_size))
                success("OK");
        else
                fail("Fail");

        thp_pop_settings();

        madvise(p, page_size, MADV_DONTNEED);
        printf("Split huge PMD on MADV_DONTNEED...");
        if (check_huge_anon(p, 0, hpage_pmd_size))
                success("OK");
        else
                fail("Fail");
        munmap(p, hpage_pmd_size);
}

/* Collapse several fully-populated PMD ranges in one request. */
static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
{
        const int nr_hpages = 4;
        const unsigned long len = nr_hpages * hpage_pmd_size;
        void *area = ops->setup_area(nr_hpages);

        ops->fault(area, 0, len);
        c->collapse("Collapse multiple fully populated PTE table", area,
                    nr_hpages, ops, true);
        validate_memory(area, 0, len);
        ops->cleanup_area(area, len);
}

/* An entirely unpopulated PTE table must never be collapsed. */
static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
{
        void *area = ops->setup_area(1);

        c->collapse("Do not collapse empty PTE table", area, 1, ops, false);
        ops->cleanup_area(area, hpage_pmd_size);
}

/* A single populated PTE suffices for collapse (limits permitting). */
static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
{
        void *area = ops->setup_area(1);

        /* Populate only the first base page of the PMD range. */
        ops->fault(area, 0, page_size);
        c->collapse("Collapse PTE table with single PTE entry present", area,
                    1, ops, true);
        ops->cleanup_area(area, hpage_pmd_size);
}

/*
 * Exercise khugepaged/max_ptes_none: with more than max_ptes_none empty
 * PTEs, collapse fails for contexts that enforce scan limits and succeeds
 * for those that don't; with exactly max_ptes_none it must succeed.
 */
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
{
        int max_ptes_none = hpage_pmd_nr / 2;
        struct thp_settings settings = *thp_current_settings();
        void *p;
        /* Anon may fault in mTHP-order batches; other backends fault one page. */
        int fault_nr_pages = is_anon(ops) ? 1 << anon_order : 1;

        settings.khugepaged.max_ptes_none = max_ptes_none;
        thp_push_settings(&settings);

        p = ops->setup_area(1);

        if (is_tmpfs(ops)) {
                /* shmem pages always in the page cache */
                printf("tmpfs...");
                skip("Skip");
                goto skip;
        }

        /* Leave max_ptes_none + fault_nr_pages PTEs empty: over the limit. */
        ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);
        c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
                    ops, !c->enforce_pte_scan_limits);
        validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - fault_nr_pages) * page_size);

        if (c->enforce_pte_scan_limits) {
                /* Exactly max_ptes_none empty PTEs: collapse must succeed. */
                ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
                c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
                            true);
                validate_memory(p, 0,
                                (hpage_pmd_nr - max_ptes_none) * page_size);
        }
skip:
        ops->cleanup_area(p, hpage_pmd_size);
        thp_pop_settings();
}

/* Collapse must swap a single swapped-out PTE back in and still succeed. */
static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
{
        void *p;

        p = ops->setup_area(1);
        ops->fault(p, 0, hpage_pmd_size);

        printf("Swapout one page...");
        if (madvise(p, page_size, MADV_PAGEOUT)) {
                perror("madvise(MADV_PAGEOUT)");
                exit(EXIT_FAILURE);
        }
        /* Confirm via smaps that the page really went to swap. */
        if (check_swap(p, page_size)) {
                success("OK");
        } else {
                fail("Fail");
                goto out;
        }

        c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
                    true);
        validate_memory(p, 0, hpage_pmd_size);
out:
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Exercise khugepaged/max_ptes_swap: with max_ptes_swap + 1 pages swapped
 * out, collapse fails for limit-enforcing contexts; with exactly
 * max_ptes_swap it must succeed.
 */
static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
{
        int max_ptes_swap = thp_read_num("khugepaged/max_ptes_swap");
        void *p;

        p = ops->setup_area(1);
        ops->fault(p, 0, hpage_pmd_size);

        printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
        if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
                perror("madvise(MADV_PAGEOUT)");
                exit(EXIT_FAILURE);
        }
        /* Confirm via smaps that the pages really went to swap. */
        if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
                success("OK");
        } else {
                fail("Fail");
                goto out;
        }

        c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
                    !c->enforce_pte_scan_limits);
        validate_memory(p, 0, hpage_pmd_size);

        if (c->enforce_pte_scan_limits) {
                /* Re-fault everything, then swap out exactly the limit. */
                ops->fault(p, 0, hpage_pmd_size);
                printf("Swapout %d of %d pages...", max_ptes_swap,
                       hpage_pmd_nr);
                if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
                        perror("madvise(MADV_PAGEOUT)");
                        exit(EXIT_FAILURE);
                }
                if (check_swap(p, max_ptes_swap * page_size)) {
                        success("OK");
                } else {
                        fail("Fail");
                        goto out;
                }

                c->collapse("Collapse with max_ptes_swap pages swapped out", p,
                            1, ops, true);
                validate_memory(p, 0, hpage_pmd_size);
        }
out:
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Split a hugepage so only one PTE of the former compound page remains
 * mapped, then verify that lone PTE can still be collapsed.
 */
static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
{
        void *p;

        p = alloc_hpage(ops);

        if (is_tmpfs(ops)) {
                /* MADV_DONTNEED won't evict tmpfs pages */
                printf("tmpfs...");
                skip("Skip");
                goto skip;
        }

        /* NOHUGEPAGE prevents khugepaged from immediately re-collapsing. */
        madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
        printf("Split huge page leaving single PTE mapping compound page...");
        madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
        if (ops->check_huge(p, 0))
                success("OK");
        else
                fail("Fail");

        c->collapse("Collapse PTE table with single PTE mapping compound page",
                    p, 1, ops, true);
        /* Only the first page survived MADV_DONTNEED. */
        validate_memory(p, 0, page_size);
skip:
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Split a hugepage into PTE mappings (keeping all pages of the compound
 * page mapped) and verify the full PTE table can be re-collapsed.
 */
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
{
        void *p;

        p = alloc_hpage(ops);
        printf("Split huge page leaving single PTE page table full of compound pages...");
        /*
         * Page-sized madvise splits the PMD (VMA split); the second call
         * marks the whole range NOHUGEPAGE to keep khugepaged away.
         */
        madvise(p, page_size, MADV_NOHUGEPAGE);
        madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
        if (ops->check_huge(p, 0))
                success("OK");
        else
                fail("Fail");

        c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
                    true);
        validate_memory(p, 0, hpage_pmd_size);
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Build a PTE table where every entry maps a page from a *different*
 * compound page: repeatedly allocate a hugepage at BASE_ADDR, keep one
 * page of it, and mremap it onto the end of a growing region just below
 * BASE_ADDR.  Then verify the mixed table collapses.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
        void *p;
        int i;

        p = ops->setup_area(1);
        for (i = 0; i < hpage_pmd_nr; i++) {
                printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
                                i + 1, hpage_pmd_nr);

                /* Fresh hugepage at BASE_ADDR for this iteration. */
                madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
                ops->fault(BASE_ADDR, 0, hpage_pmd_size);
                if (!ops->check_huge(BASE_ADDR, 1)) {
                        printf("Failed to allocate huge page\n");
                        exit(EXIT_FAILURE);
                }
                madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

                /*
                 * Move the accumulated i pages plus the new hugepage aside,
                 * shrinking to i + 1 pages (keeps page 0 of the hugepage,
                 * unmaps the rest).
                 */
                p = mremap(BASE_ADDR - i * page_size,
                                i * page_size + hpage_pmd_size,
                                (i + 1) * page_size,
                                MREMAP_MAYMOVE | MREMAP_FIXED,
                                BASE_ADDR + 2 * hpage_pmd_size);
                if (p == MAP_FAILED) {
                        perror("mremap+unmap");
                        exit(EXIT_FAILURE);
                }

                /*
                 * Move it back below BASE_ADDR, growing by one hugepage so
                 * the next iteration can fault at BASE_ADDR again.
                 */
                p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
                                (i + 1) * page_size,
                                (i + 1) * page_size + hpage_pmd_size,
                                MREMAP_MAYMOVE | MREMAP_FIXED,
                                BASE_ADDR - (i + 1) * page_size);
                if (p == MAP_FAILED) {
                        perror("mremap+alloc");
                        exit(EXIT_FAILURE);
                }
        }

        ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
        ops->fault(p, 0, hpage_pmd_size);
        /* The constructed table must NOT be PMD-mapped yet. */
        if (!ops->check_huge(p, 1))
                success("OK");
        else
                fail("Fail");

        c->collapse("Collapse PTE table full of different compound pages", p, 1,
                    ops, true);

        validate_memory(p, 0, hpage_pmd_size);
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Collapse in a child a PTE table containing a page shared (CoW) with the
 * parent, and verify the parent's mapping stays small.
 */
static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
{
        int wstatus;
        void *p;

        p = ops->setup_area(1);

        printf("Allocate small page...");
        ops->fault(p, 0, page_size);
        if (ops->check_huge(p, 0))
                success("OK");
        else
                fail("Fail");

        printf("Share small page over fork()...");
        if (!fork()) {
                /* Do not touch settings on child exit */
                skip_settings_restore = true;
                exit_status = 0;	/* child reports only its own failures */

                if (ops->check_huge(p, 0))
                        success("OK");
                else
                        fail("Fail");

                /* Fault a second page so the child has a private PTE too. */
                ops->fault(p, page_size, 2 * page_size);
                c->collapse("Collapse PTE table with single page shared with parent process",
                            p, 1, ops, true);

                validate_memory(p, 0, page_size);
                ops->cleanup_area(p, hpage_pmd_size);
                exit(exit_status);
        }

        wait(&wstatus);
        exit_status += WEXITSTATUS(wstatus);	/* fold child failures in */

        printf("Check if parent still has small page...");
        if (ops->check_huge(p, 0))
                success("OK");
        else
                fail("Fail");
        validate_memory(p, 0, page_size);
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Fork with a PMD-mapped hugepage, split it in the child, and verify the
 * child can re-collapse a PTE table full of pages shared with the parent
 * (with max_ptes_shared raised to permit it).
 */
static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
{
        int wstatus;
        void *p;

        p = alloc_hpage(ops);
        printf("Share huge page over fork()...");
        if (!fork()) {
                /* Do not touch settings on child exit */
                skip_settings_restore = true;
                exit_status = 0;	/* child reports only its own failures */

                if (ops->check_huge(p, 1))
                        success("OK");
                else
                        fail("Fail");

                printf("Split huge page PMD in child process...");
                /* Page-sized madvise splits the PMD; then mark whole range. */
                madvise(p, page_size, MADV_NOHUGEPAGE);
                madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
                if (ops->check_huge(p, 0))
                        success("OK");
                else
                        fail("Fail");
                ops->fault(p, 0, page_size);

                /* All but one PTE is shared: allow that many shared PTEs. */
                thp_write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
                c->collapse("Collapse PTE table full of compound pages in child",
                            p, 1, ops, true);
                thp_write_num("khugepaged/max_ptes_shared",
                          thp_current_settings()->khugepaged.max_ptes_shared);

                validate_memory(p, 0, hpage_pmd_size);
                ops->cleanup_area(p, hpage_pmd_size);
                exit(exit_status);
        }

        wait(&wstatus);
        exit_status += WEXITSTATUS(wstatus);	/* fold child failures in */

        printf("Check if parent still has huge page...");
        if (ops->check_huge(p, 1))
                success("OK");
        else
                fail("Fail");
        validate_memory(p, 0, hpage_pmd_size);
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Exercise khugepaged/max_ptes_shared in a forked child: with more than
 * max_ptes_shared PTEs still shared with the parent, collapse fails for
 * limit-enforcing contexts; with exactly max_ptes_shared it must succeed.
 */
static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
{
        int max_ptes_shared = thp_read_num("khugepaged/max_ptes_shared");
        int wstatus;
        void *p;

        p = alloc_hpage(ops);
        printf("Share huge page over fork()...");
        if (!fork()) {
                /* Do not touch settings on child exit */
                skip_settings_restore = true;
                exit_status = 0;	/* child reports only its own failures */

                if (ops->check_huge(p, 1))
                        success("OK");
                else
                        fail("Fail");

                /* CoW enough pages to leave max_ptes_shared + 1 shared. */
                printf("Trigger CoW on page %d of %d...",
                                hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
                ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
                if (ops->check_huge(p, 0))
                        success("OK")
;
                else
                        fail("Fail");

                c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
                            1, ops, !c->enforce_pte_scan_limits);

                if (c->enforce_pte_scan_limits) {
                        /* CoW one more page: exactly max_ptes_shared remain. */
                        printf("Trigger CoW on page %d of %d...",
                               hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
                        ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
                                    page_size);
                        if (ops->check_huge(p, 0))
                                success("OK");
                        else
                                fail("Fail");

                        c->collapse("Collapse with max_ptes_shared PTEs shared",
                                    p, 1, ops, true);
                }

                validate_memory(p, 0, hpage_pmd_size);
                ops->cleanup_area(p, hpage_pmd_size);
                exit(exit_status);
        }

        wait(&wstatus);
        exit_status += WEXITSTATUS(wstatus);	/* fold child failures in */

        printf("Check if parent still has huge page...");
        if (ops->check_huge(p, 1))
                success("OK");
        else
                fail("Fail");
        validate_memory(p, 0, hpage_pmd_size);
        ops->cleanup_area(p, hpage_pmd_size);
}

/*
 * Verify MADV_COLLAPSE behavior on memory that is already PMD-mapped:
 * collapse a fully faulted region first, then request a second collapse
 * on the resulting hugepage directly.
 */
static void madvise_collapse_existing_thps(struct collapse_context *c,
					   struct mem_ops *ops)
{
	void *area = ops->setup_area(1);

	ops->fault(area, 0, hpage_pmd_size);
	c->collapse("Collapse fully populated PTE table...", area, 1, ops, true);
	validate_memory(area, 0, hpage_pmd_size);

	/* c->collapse() will find a hugepage and complain - call directly. */
	__madvise_collapse("Re-collapse PMD-mapped hugepage", area, 1, ops, true);
	validate_memory(area, 0, hpage_pmd_size);
	ops->cleanup_area(area, hpage_pmd_size);
}

/*
 * Test race with khugepaged where page tables have been retracted and
 * pmd cleared.
 */
static void madvise_retracted_page_tables(struct collapse_context *c,
					  struct mem_ops *ops)
{
	void *p;
	int nr_hpages = 1;
	unsigned long size = nr_hpages * hpage_pmd_size;

	p = ops->setup_area(nr_hpages);
	ops->fault(p, 0, size);

	/* Let khugepaged collapse and leave pmd cleared */
	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
			  ops)) {
		fail("Timeout");
		/* Don't leak the test area on the early-exit path. */
		ops->cleanup_area(p, size);
		return;
	}
	success("OK");
	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
		    true);
	validate_memory(p, 0, size);
	ops->cleanup_area(p, size);
}

/* Print the command-line help text to stderr and exit with status 1. */
static void usage(void)
{
	static const char * const help[] = {
		"\nUsage: ./khugepaged [OPTIONS] <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=advise option for khugepaged tests to work\n",
		"\n\tSupported Options:\n",
		"\t\t-h: This help message.\n",
		"\t\t-s: mTHP size, expressed as page order.\n",
		"\t\t    Defaults to 0. Use this size for anon or shmem allocations.\n",
	};
	size_t i;

	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		fputs(help[i], stderr);
	exit(1);
}

/*
 * Parse "[-s order] <context>:<mem_type> [dir]" from the command line and
 * select the collapse context(s) and memory operations the tests run with.
 * Invalid input prints usage and exits; with no positional argument the
 * historic default (both contexts, anon memory) is selected.
 */
static void parse_test_type(int argc, char **argv)
{
	int opt;
	char *dup;
	char *buf;
	const char *token;

	while ((opt = getopt(argc, argv, "s:h")) != -1) {
		switch (opt) {
		case 's':
			anon_order = atoi(optarg);
			break;
		case 'h':
		default:
			usage();
		}
	}

	/*
	 * A negative order would later be used as an index into the per-size
	 * settings tables in main(); reject it up front.
	 */
	if (anon_order < 0)
		usage();

	argv += optind;
	argc -= optind;

	if (argc == 0) {
		/* Backwards compatibility */
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
		anon_ops = &__anon_ops;
		return;
	}

	/* Work on a writable copy: strsep() modifies the string in place. */
	dup = strdup(argv[0]);
	if (!dup) {
		perror("strdup");
		exit(EXIT_FAILURE);
	}
	buf = dup;
	token = strsep(&buf, ":");

	if (!strcmp(token, "all")) {
		khugepaged_context = &__khugepaged_context;
		madvise_context = &__madvise_context;
	} else if (!strcmp(token, "khugepaged")) {
		khugepaged_context = &__khugepaged_context;
	} else if (!strcmp(token, "madvise")) {
		madvise_context = &__madvise_context;
	} else {
		usage();
	}

	/* No ':' separator means no <mem_type> was supplied. */
	if (!buf)
		usage();

	if (!strcmp(buf, "all")) {
		file_ops = &__file_ops;
		anon_ops = &__anon_ops;
		shmem_ops = &__shmem_ops;
	} else if (!strcmp(buf, "anon")) {
		anon_ops = &__anon_ops;
	} else if (!strcmp(buf, "file")) {
		file_ops = &__file_ops;
	} else if (!strcmp(buf, "shmem")) {
		shmem_ops = &__shmem_ops;
	} else {
		usage();
	}

	/* The copy was only needed for the strcmp()s above. */
	free(dup);

	if (!file_ops)
		return;

	/* File-backed tests additionally require the [dir] argument. */
	if (argc != 2)
		usage();

	get_finfo(argv[1]);
}

/*
 * Entry point: parse CLI options, push THP/khugepaged settings suitable
 * for collapse testing, run every test applicable to the selected
 * context(s) and memory type(s), then restore the saved settings.
 */
int main(int argc, char **argv)
{
	int hpage_pmd_order;
	/* Settings pushed for the duration of the run (on top of the saved ones). */
	struct thp_settings default_settings = {
		.thp_enabled = THP_MADVISE,
		.thp_defrag = THP_DEFRAG_ALWAYS,
		.shmem_enabled = SHMEM_ADVISE,
		.use_zero_page = 0,
		.khugepaged = {
			.defrag = 1,
			.alloc_sleep_millisecs = 10,
			.scan_sleep_millisecs = 10,
		},
		/*
		 * When testing file-backed memory, the collapse path
		 * looks at how many pages are found in the page cache, not
		 * what pages are mapped. Disable read ahead optimization so
		 * pages don't find their way into the page cache unless
		 * we mem_ops->fault() them in.
		 */
		.read_ahead_kb = 0,
	};

	if (!thp_is_enabled()) {
		printf("Transparent Hugepages not available\n");
		return KSFT_SKIP;
	}

	parse_test_type(argc, argv);

	/* Unbuffered stdout so per-test progress lines appear immediately. */
	setbuf(stdout, NULL);

	page_size = getpagesize();
	hpage_pmd_size = read_pmd_pagesize();
	if (!hpage_pmd_size) {
		printf("Reading PMD pagesize failed");
		exit(EXIT_FAILURE);
	}
	hpage_pmd_nr = hpage_pmd_size / page_size;
	/* log2 of the PTE count per PMD page (hpage_pmd_nr is a power of two). */
	hpage_pmd_order = __builtin_ctz(hpage_pmd_nr);

	/* Scan limits sized relative to the PMD page, as the tests expect. */
	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
	/* Enable the PMD size plus the mTHP size chosen via -s (anon_order). */
	default_settings.hugepages[hpage_pmd_order].enabled = THP_INHERIT;
	default_settings.hugepages[anon_order].enabled = THP_ALWAYS;
	default_settings.shmem_hugepages[hpage_pmd_order].enabled = SHMEM_INHERIT;
	default_settings.shmem_hugepages[anon_order].enabled = SHMEM_ALWAYS;

	save_settings();
	thp_push_settings(&default_settings);

	alloc_at_fault();

/*
 * Run test t only when both the context c and the mem ops o were selected
 * on the command line (parse_test_type() leaves unselected ones NULL).
 */
#define TEST(t, c, o) do { \
	if (c && o) { \
		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
		t(c, o); \
	} \
	} while (0)

	TEST(collapse_full, khugepaged_context, anon_ops);
	TEST(collapse_full, khugepaged_context, file_ops);
	TEST(collapse_full, khugepaged_context, shmem_ops);
	TEST(collapse_full, madvise_context, anon_ops);
	TEST(collapse_full, madvise_context, file_ops);
	TEST(collapse_full, madvise_context, shmem_ops);

	TEST(collapse_empty, khugepaged_context, anon_ops);
	TEST(collapse_empty, madvise_context, anon_ops);

	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry, madvise_context, file_ops);
	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);

	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
	TEST(collapse_max_ptes_none, madvise_context, file_ops);

	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);

	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
	TEST(collapse_full_of_compound, madvise_context, anon_ops);
	TEST(collapse_full_of_compound, madvise_context, file_ops);
	TEST(collapse_full_of_compound, madvise_context, shmem_ops);

	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
	TEST(collapse_compound_extreme, madvise_context, anon_ops);

	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);

	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);

	TEST(collapse_fork, khugepaged_context, anon_ops);
	TEST(collapse_fork, madvise_context, anon_ops);

	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
	TEST(collapse_fork_compound, madvise_context, anon_ops);

	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);

	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);

	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);

	/*
	 * Restore the settings saved above; NOTE(review): presumably also
	 * terminates the process with the accumulated exit status — verify
	 * against restore_settings()'s definition.
	 */
	restore_settings(0);
}