root/tools/testing/selftests/mm/uffd-common.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Userfaultfd tests util functions
 *
 * Copyright (C) 2015-2023  Red Hat, Inc.
 */

#include "uffd-common.h"

/*
 * Memory-type backend in use.  The helpers in this file call its
 * allocate_area(), release_pages() and alias_mapping() hooks.
 */
uffd_test_ops_t *uffd_test_ops;
/*
 * Optional per-test-case hooks.  uffd_test_ctx_init() calls pre_alloc() and
 * post_alloc() through this pointer when both the pointer and the hook are
 * non-NULL.
 */
uffd_test_case_ops_t *uffd_test_case_ops;


/*
 * Return the pthread_mutex_t of page @nr inside @area.  The mutex sits at
 * offset 0 of its page; pages are gopts->page_size bytes apart.
 */
pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
{
        char *page_start = area + nr * gopts->page_size;

        return (pthread_mutex_t *)page_start;
}

/*
 * Return the counter of page @nr inside @area.
 *
 * The counter lives in the same page as the mutex, right after the
 * pthread_mutex_t, rounded up to the natural alignment of
 * unsigned long long so that non-x86 archs do not take alignment faults.
 */
volatile unsigned long long *area_count(char *area, unsigned long nr,
                                        uffd_global_test_opts_t *gopts)
{
        const unsigned long align_mask = sizeof(unsigned long long) - 1;
        unsigned long addr;

        /* First byte past the mutex, bumped so the mask below rounds up. */
        addr = (unsigned long)(area + nr * gopts->page_size +
                               sizeof(pthread_mutex_t) + align_mask);
        addr &= ~align_mask;

        return (volatile unsigned long long *)addr;
}

/*
 * Create a memfd named "uffd-test" of @mem_size bytes and punch a hole over
 * the whole range so that no page is backed yet.
 *
 * @hugetlb selects MFD_HUGETLB.  Any failure is reported through err().
 * Returns the new file descriptor; the caller owns it.
 */
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
        int fd = memfd_create("uffd-test", hugetlb ? MFD_HUGETLB : 0);

        if (fd < 0)
                err("memfd_create");

        if (ftruncate(fd, mem_size) != 0)
                err("ftruncate");

        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      0, mem_size) != 0)
                err("fallocate");

        return fd;
}

/*
 * Drop all pages of the anonymous test area starting at @rel_area
 * (nr_pages * page_size bytes) with MADV_DONTNEED.
 */
static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
        size_t bytes = gopts->nr_pages * gopts->page_size;

        if (madvise(rel_area, bytes, MADV_DONTNEED) != 0)
                err("madvise(MADV_DONTNEED) failed");
}

/*
 * Map a private anonymous read/write area of nr_pages * page_size bytes.
 *
 * On success store the address in *@alloc_area and return 0.  On failure
 * store NULL and return -errno.  @is_src is unused for anonymous memory.
 */
static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
        void *map = mmap(NULL, gopts->nr_pages * gopts->page_size,
                         PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

        if (map != MAP_FAILED) {
                *alloc_area = map;
                return 0;
        }

        *alloc_area = NULL;
        return -errno;
}

/*
 * alias_mapping() hook for memory types without an alias mapping: leaves
 * *@start untouched and ignores every argument.
 */
static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
                               size_t len, unsigned long offset)
{
        /* Intentionally empty. */
}

/*
 * Drop all pages of the hugetlb test area starting at @rel_area.
 *
 * Shared mappings are released with MADV_REMOVE, private mappings with
 * MADV_DONTNEED.
 */
static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
        size_t bytes = gopts->nr_pages * gopts->page_size;

        if (gopts->map_shared) {
                if (madvise(rel_area, bytes, MADV_REMOVE) != 0)
                        err("madvise(MADV_REMOVE) failed");
                return;
        }

        if (madvise(rel_area, bytes, MADV_DONTNEED) != 0)
                err("madvise(MADV_DONTNEED) failed");
}

/*
 * Map one hugetlb-backed test area of nr_pages * page_size bytes.
 *
 * A memfd twice that size is created; the source area maps its first half
 * and the destination area its second half (the latter with MAP_NORESERVE).
 * When gopts->map_shared is set a second MAP_SHARED mapping of the same
 * file range is created and recorded in gopts->area_src_alias or
 * gopts->area_dst_alias, depending on @is_src.
 *
 * On success *@alloc_area holds the mapping and 0 is returned.  On failure
 * *@alloc_area is NULL, nothing stays mapped and -errno of the failing
 * mmap() is returned.  The memfd is closed on every path; the mappings keep
 * the file alive.
 */
static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
        off_t size = gopts->nr_pages * gopts->page_size;
        off_t offset = is_src ? 0 : size;
        void *area_alias;
        int mem_fd = uffd_mem_fd_create(size * 2, true);
        int ret = 0;

        *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                           (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) |
                           (is_src ? 0 : MAP_NORESERVE),
                           mem_fd, offset);
        if (*alloc_area == MAP_FAILED) {
                /* Latch errno before close() below can overwrite it. */
                ret = -errno;
                *alloc_area = NULL;
                goto out_close;
        }

        if (gopts->map_shared) {
                area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
                                  MAP_SHARED, mem_fd, offset);
                if (area_alias == MAP_FAILED) {
                        ret = -errno;
                        /* Do not leave a half-initialized area behind. */
                        munmap(*alloc_area, size);
                        *alloc_area = NULL;
                        goto out_close;
                }

                if (is_src)
                        gopts->area_src_alias = area_alias;
                else
                        gopts->area_dst_alias = area_alias;
        }

out_close:
        close(mem_fd);
        return ret;
}

/*
 * For shared hugetlb mappings, redirect *@start to @offset bytes into the
 * destination alias mapping.  Private mappings have no alias, so *@start is
 * left unchanged.  @len is unused.
 */
static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
                                  size_t len, unsigned long offset)
{
        if (gopts->map_shared)
                *start = (unsigned long) gopts->area_dst_alias + offset;
}

/*
 * Drop all pages of the shmem test area starting at @rel_area
 * (nr_pages * page_size bytes) with MADV_REMOVE.
 */
static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area)
{
        size_t bytes = gopts->nr_pages * gopts->page_size;

        if (madvise(rel_area, bytes, MADV_REMOVE) != 0)
                err("madvise(MADV_REMOVE) failed");
}

/*
 * Map one shmem (memfd) backed test area plus its alias mapping.
 *
 * A PROT_NONE region of 2 * bytes + hpage_size is reserved first.  The area
 * is then mapped MAP_FIXED at the start of the reservation and the alias of
 * the same file range at reservation + bytes + hpage_size, which leaves a
 * gap of hpage_size between the two so the VMAs cannot merge.  The source
 * area uses file offset 0, the destination area file offset bytes.
 *
 * On success *@alloc_area holds the area, the alias is stored in
 * gopts->area_src_alias or gopts->area_dst_alias (per @is_src) and 0 is
 * returned.  On failure *@alloc_area is NULL (unless the initial reservation
 * failed, in which case it is untouched), the reservation is unmapped and
 * -errno of the failing mmap() is returned.  The memfd is closed on every
 * path.
 */
static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src)
{
        void *area_alias = NULL;
        size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize();
        unsigned long offset = is_src ? 0 : bytes;
        char *p = NULL, *p_alias = NULL;
        int mem_fd = uffd_mem_fd_create(bytes * 2, false);
        size_t region_size = bytes * 2 + hpage_size;
        void *reserve;
        int ret;

        reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS,
                       -1, 0);
        if (reserve == MAP_FAILED) {
                /* Latch errno before the cleanup calls can overwrite it. */
                ret = -errno;
                goto out_close;
        }

        p = reserve;
        p_alias = p;
        p_alias += bytes;
        p_alias += hpage_size;  /* Prevent src/dst VMA merge */

        *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
                           mem_fd, offset);
        if (*alloc_area == MAP_FAILED) {
                ret = -errno;
                goto out_unmap;
        }
        if (*alloc_area != p)
                err("mmap of memfd failed at %p", p);

        area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED,
                          mem_fd, offset);
        if (area_alias == MAP_FAILED) {
                ret = -errno;
                goto out_unmap;
        }
        /* The alias is a second mapping of the memfd, not anonymous memory. */
        if (area_alias != p_alias)
                err("mmap of memfd alias failed at %p", p_alias);

        if (is_src)
                gopts->area_src_alias = area_alias;
        else
                gopts->area_dst_alias = area_alias;

        close(mem_fd);
        return 0;

out_unmap:
        /* Unmapping the reservation also removes anything mapped inside it. */
        *alloc_area = NULL;
        munmap(reserve, region_size);
out_close:
        close(mem_fd);
        return ret;
}

/*
 * Redirect *@start to @offset bytes into the destination alias mapping.
 * @len is unused.
 */
static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start,
                                size_t len, unsigned long offset)
{
        unsigned long alias_base = (unsigned long)gopts->area_dst_alias;

        *start = alias_base + offset;
}

/*
 * Verify through check_huge_shmem() that the destination alias mapping is
 * backed by @expect_nr_hpages huge pages of the PMD page size; report via
 * err() otherwise.  @p is unused.
 */
static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages)
{
        if (check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages,
                             read_pmd_pagesize()))
                return;

        err("Did not find expected %d number of hugepages",
            expect_nr_hpages);
}

/* Hooks for private anonymous memory: no alias mapping, no PMD check. */
struct uffd_test_ops anon_uffd_test_ops = {
        .allocate_area = anon_allocate_area,
        .release_pages = anon_release_pages,
        .alias_mapping = noop_alias_mapping,
        .check_pmd_mapping = NULL,
};

/* Hooks for memfd-backed shmem: always aliased, supports the PMD check. */
struct uffd_test_ops shmem_uffd_test_ops = {
        .allocate_area = shmem_allocate_area,
        .release_pages = shmem_release_pages,
        .alias_mapping = shmem_alias_mapping,
        .check_pmd_mapping = shmem_check_pmd_mapping,
};

/* Hooks for hugetlb memfd memory: aliased only when shared, no PMD check. */
struct uffd_test_ops hugetlb_uffd_test_ops = {
        .allocate_area = hugetlb_allocate_area,
        .release_pages = hugetlb_release_pages,
        .alias_mapping = hugetlb_alias_mapping,
        .check_pmd_mapping = NULL,
};

void uffd_stats_report(struct uffd_args *args, int n_cpus)
{
        int i;
        unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

        for (i = 0; i < n_cpus; i++) {
                miss_total += args[i].missing_faults;
                wp_total += args[i].wp_faults;
                minor_total += args[i].minor_faults;
        }

        printf("userfaults: ");
        if (miss_total) {
                printf("%llu missing (", miss_total);
                for (i = 0; i < n_cpus; i++)
                        printf("%lu+", args[i].missing_faults);
                printf("\b) ");
        }
        if (wp_total) {
                printf("%llu wp (", wp_total);
                for (i = 0; i < n_cpus; i++)
                        printf("%lu+", args[i].wp_faults);
                printf("\b) ");
        }
        if (minor_total) {
                printf("%llu minor (", minor_total);
                for (i = 0; i < n_cpus; i++)
                        printf("%lu+", args[i].minor_faults);
                printf("\b)");
        }
        printf("\n");
}

/*
 * Open a userfaultfd with UFFD_FLAGS and perform the UFFDIO_API handshake.
 *
 * @features carries the requested feature bits in and the bits reported by
 * the kernel out.  The descriptor is stored in gopts->uffd and its F_GETFD
 * flags in gopts->uffd_flags.
 *
 * Returns 0 on success, -1 when the descriptor cannot be opened or the
 * UFFDIO_API ioctl fails (the original comment suspects missing
 * CAP_PTRACE).  An unexpected API value in the reply is reported via err().
 */
int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features)
{
        struct uffdio_api req;

        gopts->uffd = uffd_open(UFFD_FLAGS);
        if (gopts->uffd < 0)
                return -1;

        gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL);

        req.features = *features;
        req.api = UFFD_API;
        if (ioctl(gopts->uffd, UFFDIO_API, &req) != 0)
                return -1;      /* Probably lack of CAP_PTRACE? */

        if (req.api != UFFD_API)
                err("UFFDIO_API error: %" PRIu64, (uint64_t)req.api);

        *features = req.features;
        return 0;
}

/*
 * Unmap the nr_pages * page_size bytes at *@area if it is non-NULL, then
 * reset *@area to NULL.  A munmap() failure is reported via err().
 */
static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area)
{
        void *addr = *area;

        if (addr && munmap(addr, gopts->nr_pages * gopts->page_size))
                err("munmap");

        *area = NULL;
}

/*
 * Release what a test context holds: the nr_parallel pipe pairs, the
 * count_verify array, the userfaultfd descriptor and all five test areas
 * (src, src alias, dst, dst alias, remap).  Every released field is reset
 * (NULL or -1), so the function is safe on an already-cleared context.
 */
void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts)
{
        void **areas[] = {
                (void **)&gopts->area_src,
                (void **)&gopts->area_src_alias,
                (void **)&gopts->area_dst,
                (void **)&gopts->area_dst_alias,
                (void **)&gopts->area_remap,
        };
        size_t idx;

        if (gopts->pipefd != NULL) {
                idx = 0;
                while (idx < gopts->nr_parallel * 2) {
                        if (close(gopts->pipefd[idx]) != 0)
                                err("close pipefd");
                        ++idx;
                }
                free(gopts->pipefd);
                gopts->pipefd = NULL;
        }

        /* free(NULL) is a no-op, so no guard is needed here. */
        free(gopts->count_verify);
        gopts->count_verify = NULL;

        if (gopts->uffd != -1) {
                if (close(gopts->uffd) != 0)
                        err("close uffd");
                gopts->uffd = -1;
        }

        for (idx = 0; idx < sizeof(areas) / sizeof(areas[0]); idx++)
                munmap_area(gopts, areas[idx]);
}

/*
 * Set up a test context: allocate the source and destination areas, open
 * the userfaultfd with @features, initialize the per-page mutex and counter
 * in area_src, empty area_dst and create one non-blocking pipe pair per
 * parallel worker.
 *
 * The optional uffd_test_case_ops pre_alloc()/post_alloc() hooks run before
 * and after the area allocation; a non-zero return from either aborts the
 * setup with that value.
 *
 * Returns 0 on success or a non-zero error code, in which case *@errmsg
 * (when @errmsg is non-NULL) is set for allocation and open failures.
 * malloc() and pipe2() failures are reported via err().
 */
int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg)
{
        unsigned long nr, cpu;
        int ret, ret_dst;

        gopts->area_src_alias = NULL;
        gopts->area_dst_alias = NULL;
        gopts->area_remap = NULL;

        if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
                ret = uffd_test_case_ops->pre_alloc(gopts, errmsg);
                if (ret)
                        return ret;
        }

        /*
         * Always attempt both allocations so both area pointers end up in a
         * defined state, but report the first failure as-is: OR-ing two
         * negative errno values would yield a meaningless code.
         */
        ret = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_src, true);
        ret_dst = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_dst, false);
        if (!ret)
                ret = ret_dst;
        if (ret) {
                if (errmsg)
                        *errmsg = "memory allocation failed";
                return ret;
        }

        if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
                ret = uffd_test_case_ops->post_alloc(gopts, errmsg);
                if (ret)
                        return ret;
        }

        ret = userfaultfd_open(gopts, &features);
        if (ret) {
                if (errmsg)
                        *errmsg = "possible lack of privilege";
                return ret;
        }

        gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long));
        if (!gopts->count_verify)
                err("count_verify");

        for (nr = 0; nr < gopts->nr_pages; nr++) {
                *area_mutex(gopts->area_src, nr, gopts) =
                        (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
                gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1;
                /*
                 * In the transition between 255 to 256, powerpc will
                 * read out of order in my_bcmp and see both bytes as
                 * zero, so leave a placeholder below always non-zero
                 * after the count, to avoid my_bcmp to trigger false
                 * positives.
                 */
                *(area_count(gopts->area_src, nr, gopts) + 1) = 1;
        }

        /*
         * After initialization of area_src, we must explicitly release pages
         * for area_dst to make sure it's fully empty.  Otherwise we could have
         * some area_dst pages be erroneously initialized with zero pages,
         * hence we could hit memory corruption later in the test.
         *
         * One example is when THP is globally enabled, above allocate_area()
         * calls could have the two areas merged into a single VMA (as they
         * will have the same VMA flags so they're mergeable).  When we
         * initialize the area_src above, it's possible that some part of
         * area_dst could have been faulted in via one huge THP that will be
         * shared between area_src and area_dst.  It could cause some of the
         * area_dst won't be trapped by missing userfaults.
         *
         * This release_pages() will guarantee even if that happened, we'll
         * proactively split the thp and drop any accidentally initialized
         * pages within area_dst.
         */
        uffd_test_ops->release_pages(gopts, gopts->area_dst);

        /* Two descriptors (read end, write end) per parallel worker. */
        gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2);
        if (!gopts->pipefd)
                err("pipefd");
        for (cpu = 0; cpu < gopts->nr_parallel; cpu++)
                if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
                        err("pipe");

        return 0;
}

/*
 * Set or clear userfaultfd write protection on [@start, @start + @len).
 *
 * @ufd is the userfaultfd descriptor.  @wp == true requests
 * UFFDIO_WRITEPROTECT_MODE_WP; @wp == false passes a mode of 0, which
 * removes the protection.  A failing ioctl is reported via err(), naming
 * the operation that was actually attempted.
 */
void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
        struct uffdio_writeprotect prms;

        prms.range.start = start;
        prms.range.len = len;
        /* Mode 0 undoes write-protect, with the wakeup done after that. */
        prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

        if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
                err("%s WP failed: address=0x%"PRIx64,
                    wp ? "set" : "clear", (uint64_t)start);
}

/*
 * Resolve a minor fault on [@start, @start + @len) with UFFDIO_CONTINUE,
 * adding UFFDIO_CONTINUE_MODE_WP when @wp is set, then issue the same
 * request a second time and require it to fail with -EEXIST.
 */
static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
{
        struct uffdio_continue cont;
        int rc;

        cont.range.start = start;
        cont.range.len = len;
        cont.mode = wp ? UFFDIO_CONTINUE_MODE_WP : 0;

        if (ioctl(ufd, UFFDIO_CONTINUE, &cont) != 0)
                err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
                    (uint64_t)start);

        /*
         * Error handling within the kernel for continue is subtly different
         * from copy or zeropage, so it may be a source of bugs.  Repeat the
         * request to provoke -EEXIST on purpose and verify that doing so
         * does not cause a BUG.
         */
        cont.mapped = 0;
        rc = ioctl(ufd, UFFDIO_CONTINUE, &cont);
        if (rc >= 0 || cont.mapped != -EEXIST)
                err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
                    rc, (int64_t) cont.mapped);
}

/*
 * Read one struct uffd_msg from gopts->uffd into @msg.
 *
 * Returns 0 when a full message was read and 1 when the read failed with
 * EAGAIN or EINTR (nothing consumed, the caller may retry).  Any other
 * read error and any short read are reported via err().
 */
int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg)
{
        int nread = read(gopts->uffd, msg, sizeof(*msg));

        if (nread == sizeof(*msg))
                return 0;

        if (nread >= 0) {
                err("short read");
        } else {
                if (errno == EAGAIN || errno == EINTR)
                        return 1;
                err("blocking read error");
        }

        return 0;
}

/*
 * Default page-fault handler: resolve the fault described by @msg and bump
 * the matching counter in @args.
 *
 *  - write-protect fault: clear the protection on the faulting page;
 *  - minor fault: bit-flip the page through area_dst, then UFFDIO_CONTINUE;
 *  - missing fault: copy the page in from area_src.
 *
 * Any event other than UFFD_EVENT_PAGEFAULT is reported via err().
 */
void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg,
                            struct uffd_args *args)
{
        __u64 fault_addr, fault_flags;
        unsigned long offset;

        if (msg->event != UFFD_EVENT_PAGEFAULT)
                err("unexpected msg event %u", msg->event);

        fault_addr = msg->arg.pagefault.address;
        fault_flags = msg->arg.pagefault.flags;

        if (fault_flags & UFFD_PAGEFAULT_FLAG_WP) {
                /* Write protect page faults */
                wp_range(gopts->uffd, fault_addr, gopts->page_size, false);
                args->wp_faults++;
                return;
        }

        if (fault_flags & UFFD_PAGEFAULT_FLAG_MINOR) {
                uint8_t *page;
                int b;

                /*
                 * Minor page faults
                 *
                 * To prove we can modify the original range for testing
                 * purposes, bit flip the whole page before continuing.
                 *
                 * Note that this requires all minor page fault tests to
                 * operate on area_dst (non-UFFD-registered) and
                 * area_dst_alias (UFFD-registered): the fault address lies
                 * in the alias and is translated to the same offset inside
                 * area_dst.
                 */
                page = (uint8_t *)(gopts->area_dst +
                       ((char *)fault_addr - gopts->area_dst_alias));
                for (b = 0; b < gopts->page_size; ++b)
                        page[b] ^= 0xff;
                continue_range(gopts->uffd, fault_addr, gopts->page_size,
                               args->apply_wp);
                args->minor_faults++;
                return;
        }

        /*
         * Missing page faults.
         *
         * Here we force a write check for each of the missing mode
         * faults.  It's guaranteed because the only threads that
         * will trigger uffd faults are the locking threads, and
         * their first instruction to touch the missing page will
         * always be pthread_mutex_lock().
         *
         * Note that here we relied on an NPTL glibc impl detail to
         * always read the lock type at the entry of the lock op
         * (pthread_mutex_t.__data.__type, offset 0x10) before
         * doing any locking operations to guarantee that.  It's
         * actually not good to rely on this impl detail because
         * logically a pthread-compatible lib can implement the
         * locks without types and we can fail when linking with
         * them.  However since we used to find bugs with this
         * strict check we still keep it around.  Hopefully this
         * could be a good hint when it fails again.  If one day
         * it'll break on some other impl of glibc we'll revisit.
         */
        if (fault_flags & UFFD_PAGEFAULT_FLAG_WRITE)
                err("unexpected write fault");

        /* Page-aligned offset of the fault inside area_dst. */
        offset = (char *)(unsigned long)fault_addr - gopts->area_dst;
        offset &= ~(gopts->page_size-1);

        if (copy_page(gopts, offset, args->apply_wp))
                args->missing_faults++;
}

void *uffd_poll_thread(void *arg)
{
        struct uffd_args *args = (struct uffd_args *)arg;
        uffd_global_test_opts_t *gopts = args->gopts;
        unsigned long cpu = args->cpu;
        struct pollfd pollfd[2];
        struct uffd_msg msg;
        struct uffdio_register uffd_reg;
        int ret;
        char tmp_chr;

        if (!args->handle_fault)
                args->handle_fault = uffd_handle_page_fault;

        pollfd[0].fd = gopts->uffd;
        pollfd[0].events = POLLIN;
        pollfd[1].fd = gopts->pipefd[cpu*2];
        pollfd[1].events = POLLIN;

        gopts->ready_for_fork = true;

        for (;;) {
                ret = poll(pollfd, 2, -1);
                if (ret <= 0) {
                        if (errno == EINTR || errno == EAGAIN)
                                continue;
                        err("poll error: %d", ret);
                }
                if (pollfd[1].revents) {
                        if (!(pollfd[1].revents & POLLIN))
                                err("pollfd[1].revents %d", pollfd[1].revents);
                        if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
                                err("read pipefd error");
                        break;
                }
                if (!(pollfd[0].revents & POLLIN))
                        err("pollfd[0].revents %d", pollfd[0].revents);
                if (uffd_read_msg(gopts, &msg))
                        continue;
                switch (msg.event) {
                default:
                        err("unexpected msg event %u\n", msg.event);
                        break;
                case UFFD_EVENT_PAGEFAULT:
                        args->handle_fault(gopts, &msg, args);
                        break;
                case UFFD_EVENT_FORK:
                        close(gopts->uffd);
                        gopts->uffd = msg.arg.fork.ufd;
                        pollfd[0].fd = gopts->uffd;
                        break;
                case UFFD_EVENT_REMOVE:
                        uffd_reg.range.start = msg.arg.remove.start;
                        uffd_reg.range.len = msg.arg.remove.end -
                                msg.arg.remove.start;
                        if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
                                err("remove failure");
                        break;
                case UFFD_EVENT_REMAP:
                        gopts->area_remap = gopts->area_dst;  /* save for later unmap */
                        gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to;
                        break;
                }
        }

        return NULL;
}

/*
 * Re-issue an already completed UFFDIO_COPY and require it to fail with
 * -EEXIST.  The destination is first passed through the alias_mapping()
 * hook of the active memory type; success or any other error is reported
 * via err().
 */
static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy,
                            unsigned long offset)
{
        uffd_test_ops->alias_mapping(gopts, &uffdio_copy->dst,
                                     uffdio_copy->len, offset);

        if (ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy) == 0) {
                err("UFFDIO_COPY retry unexpected: %"PRId64,
                    (int64_t)uffdio_copy->copy);
        } else if (uffdio_copy->copy != -EEXIST) {
                /* The real return value is in uffdio_copy->copy. */
                err("UFFDIO_COPY retry error: %"PRId64,
                (int64_t)uffdio_copy->copy);
        }
}

static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
        struct uffdio_range uffdio_wake;

        uffdio_wake.start = addr;
        uffdio_wake.len = len;

        if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
                fprintf(stderr, "error waking %lu\n",
                        addr), exit(1);
}

/*
 * Resolve a missing fault by copying one page from area_src + @offset to
 * area_dst + @offset with UFFDIO_COPY (UFFDIO_COPY_MODE_WP when @wp).
 *
 * Returns 1 when this call installed the page.  Returns 0 when the ioctl
 * failed with -EEXIST, in which case the range is only woken.  Other
 * failures are reported via err().
 *
 * When @retry and gopts->test_uffdio_copy_eexist are both set, the flag is
 * cleared and the same copy is issued once more through retry_copy_page().
 */
int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp)
{
        struct uffdio_copy req;

        if (offset >= gopts->nr_pages * gopts->page_size)
                err("unexpected offset %lu\n", offset);

        req.dst = (unsigned long) gopts->area_dst + offset;
        req.src = (unsigned long) gopts->area_src + offset;
        req.len = gopts->page_size;
        req.mode = wp ? UFFDIO_COPY_MODE_WP : 0;
        req.copy = 0;

        if (ioctl(gopts->uffd, UFFDIO_COPY, &req)) {
                /* The real return value is in req.copy. */
                if (req.copy != -EEXIST)
                        err("UFFDIO_COPY error: %"PRId64,
                            (int64_t)req.copy);
                wake_range(gopts->uffd, req.dst, gopts->page_size);
                return 0;
        }

        if (req.copy != gopts->page_size) {
                err("UFFDIO_COPY error: %"PRId64, (int64_t)req.copy);
                return 0;
        }

        if (gopts->test_uffdio_copy_eexist && retry) {
                gopts->test_uffdio_copy_eexist = false;
                retry_copy_page(gopts, &req, offset);
        }

        return 1;
}

/*
 * Copy one page at @offset via __copy_page() with the retry check turned
 * off.  Returns the value of __copy_page().
 */
int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp)
{
        const bool retry = false;

        return __copy_page(gopts, offset, retry, wp);
}

/*
 * Move @len bytes from area_src + @offset to area_dst + @offset with
 * UFFDIO_MOVE, allowing holes in the source range.
 *
 * Returns 1 when the whole range was moved.  Returns 0 when the ioctl
 * failed with -EEXIST, in which case the range is only woken.  Other
 * failures are reported via err().
 */
int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len)
{
        struct uffdio_move req;

        if (offset + len > gopts->nr_pages * gopts->page_size)
                err("unexpected offset %lu and length %lu\n", offset, len);

        req.dst = (unsigned long) gopts->area_dst + offset;
        req.src = (unsigned long) gopts->area_src + offset;
        req.len = len;
        req.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
        req.move = 0;

        if (ioctl(gopts->uffd, UFFDIO_MOVE, &req)) {
                /* The real return value is in req.move. */
                if (req.move != -EEXIST)
                        err("UFFDIO_MOVE error: %"PRId64,
                            (int64_t)req.move);
                wake_range(gopts->uffd, req.dst, len);
                return 0;
        }

        if (req.move != len) {
                err("UFFDIO_MOVE error: %"PRId64, (int64_t)req.move);
                return 0;
        }

        return 1;
}

/*
 * Create a userfaultfd through /dev/userfaultfd (USERFAULTFD_IOC_NEW) with
 * @flags.  Returns the new descriptor, or a negative value when the device
 * cannot be opened or the ioctl fails.
 */
int uffd_open_dev(unsigned int flags)
{
        int dev_fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
        int new_uffd;

        if (dev_fd < 0)
                return dev_fd;

        new_uffd = ioctl(dev_fd, USERFAULTFD_IOC_NEW, flags);
        /* The device node is only needed to create the descriptor. */
        close(dev_fd);

        return new_uffd;
}

/*
 * Create a userfaultfd with the userfaultfd() system call and @flags.
 * Returns the syscall result, or -1 when __NR_userfaultfd is not defined
 * at build time.
 */
int uffd_open_sys(unsigned int flags)
{
#ifndef __NR_userfaultfd
        return -1;
#else
        return syscall(__NR_userfaultfd, flags);
#endif
}

/*
 * Open a userfaultfd with @flags: try the system call first and fall back
 * to /dev/userfaultfd when that fails.  Returns the descriptor or a
 * negative value when both paths fail.
 */
int uffd_open(unsigned int flags)
{
        int fd = uffd_open_sys(flags);

        return fd >= 0 ? fd : uffd_open_dev(flags);
}

/*
 * Query the userfaultfd feature bits supported by the running kernel.
 *
 * A temporary userfaultfd is opened (user-mode-only first, then without
 * flags), UFFDIO_API is issued with an empty feature request and the
 * descriptor is closed again.
 *
 * Returns 0 and stores the bits reported by the kernel in *@features.
 * Returns the negative value from uffd_open() when no descriptor can be
 * opened, or -errno of the failing UFFDIO_API ioctl.
 */
int uffd_get_features(uint64_t *features)
{
        struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
        int ret = 0;
        /*
         * This should by default work in most kernels; the feature list
         * will be the same no matter what we pass in here.
         */
        int fd = uffd_open(UFFD_USER_MODE_ONLY);

        if (fd < 0)
                /* Maybe the kernel is older than user-only mode? */
                fd = uffd_open(0);

        if (fd < 0)
                return fd;

        if (ioctl(fd, UFFDIO_API, &uffdio_api))
                /* Latch errno now: close() below may overwrite it. */
                ret = -errno;
        else
                *features = uffdio_api.features;

        close(fd);

        return ret;
}