root/tools/testing/selftests/rseq/rseq.c
// SPDX-License-Identifier: LGPL-2.1
/*
 * rseq.c
 *
 * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; only
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 */

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <syscall.h>
#include <assert.h>
#include <signal.h>
#include <limits.h>
#include <dlfcn.h>
#include <stddef.h>
#include <sys/auxv.h>
#include <linux/auxvec.h>

#include <linux/compiler.h>

#include "kselftest.h"
#include "rseq.h"

/*
 * Define weak versions to play nice with binaries that are statically linked
 * against a libc that doesn't support registering its own rseq.
 *
 * rseq_init() reads these three values to decide whether libc already owns
 * the rseq registration.
 */
extern __weak ptrdiff_t __rseq_offset;
extern __weak unsigned int __rseq_size;
extern __weak unsigned int __rseq_flags;

/*
 * Pointers through which the libc values are accessed. rseq_init() checks
 * them for NULL before dereferencing, and overwrites them with the results
 * of dlsym(RTLD_NEXT, ...) when the size pointer is NULL or points at 0.
 */
static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
static const unsigned int *libc_rseq_size_p = &__rseq_size;
static const unsigned int *libc_rseq_flags_p = &__rseq_flags;

/*
 * Offset from the thread pointer to the rseq area. Copied from libc's
 * __rseq_offset when libc owns the registration, otherwise computed by
 * rseq_init() from the address of this file's own TLS area.
 */
ptrdiff_t rseq_offset;

/*
 * Size of the registered rseq area. 0 if the registration was
 * unsuccessful.
 *
 * The static initial value is -1U. When this file owns the registration,
 * rseq_init() resets it to 0, the first successful
 * rseq_register_current_thread() stores the kernel feature size, and
 * rseq_exit() puts it back to -1U. When libc owns the registration,
 * rseq_init() derives the value from libc's __rseq_size.
 */
unsigned int rseq_size = -1U;

/*
 * Flags used during rseq registration. Copied from libc's __rseq_flags when
 * libc owns the registration, otherwise set to 0 by rseq_init().
 */
unsigned int rseq_flags;

/*
 * Non-zero when this file, rather than libc, is responsible for registering
 * and unregistering threads. Set by rseq_init(), cleared by rseq_exit().
 */
static int rseq_ownership;

/*
 * Allocate a large area for the TLS.
 *
 * This is the size in bytes of union rseq_tls. The alignment and feature
 * size reported through the auxiliary vector are asserted not to exceed it.
 */
#define RSEQ_THREAD_AREA_ALLOC_SIZE     1024

/*
 * Original struct rseq feature size is 20 bytes. Used as the feature size
 * when the auxiliary vector does not report one.
 */
#define ORIG_RSEQ_FEATURE_SIZE          20

/*
 * Original struct rseq allocation size is 32 bytes. Used as the lower bound
 * for the length passed to the rseq system call.
 */
#define ORIG_RSEQ_ALLOC_SIZE            32

/*
 * Use a union to ensure we allocate a TLS area of 1024 bytes to accommodate an
 * rseq registration that is larger than the current rseq ABI.
 */
union rseq_tls {
        /* The rseq ABI structure whose address is passed to the kernel. */
        struct rseq_abi abi;
        /* Never accessed; only pads the union to RSEQ_THREAD_AREA_ALLOC_SIZE. */
        char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
};

/*
 * Per-thread rseq area used when this file performs the registration itself
 * (see rseq_register_current_thread() and rseq_unregister_current_thread()).
 *
 * cpu_id starts out as RSEQ_ABI_CPU_ID_UNINITIALIZED; all other bytes are
 * zero-initialized. rseq_init() computes the offset of this area from the
 * thread pointer once and publishes it in rseq_offset.
 * NOTE(review): the initial-exec TLS model is presumably what makes that
 * single offset valid for every thread -- confirm.
 */
static
__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
        .abi = {
                .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
        },
};

/*
 * Invoke the raw rseq system call.
 *
 * @rseq_abi: address of the rseq area to (un)register.
 * @rseq_len: length of that area in bytes.
 * @flags:    registration flags, forwarded unchanged.
 * @sig:      signature value, forwarded unchanged.
 *
 * Returns the value of syscall(), narrowed to int; callers in this file
 * treat any non-zero value as failure and inspect errno.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
                    int flags, uint32_t sig)
{
        long ret;

        ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
        return (int) ret;
}

/*
 * Invoke the raw getcpu system call.
 *
 * @cpu:  where the current CPU number is stored.
 * @node: where the current NUMA node number is stored.
 *
 * The third system call argument is always passed as NULL. Returns the
 * value of syscall(), narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
        long ret;

        ret = syscall(__NR_getcpu, cpu, node, NULL);
        return (int) ret;
}

/*
 * Probe whether the running kernel implements the rseq system call.
 *
 * A deliberately invalid request (NULL area, zero length) is issued, so the
 * call is expected to fail. The errno value tells the two cases apart:
 * ENOSYS means the system call does not exist, EINVAL means it exists and
 * rejected the arguments. Any other outcome, including success, aborts the
 * process.
 */
bool rseq_available(void)
{
        int err;

        if (sys_rseq(NULL, 0, 0, 0) != -1)
                abort();

        err = errno;
        if (err == ENOSYS)
                return false;
        if (err == EINVAL)
                return true;
        abort();
}

/*
 * Length to pass to the rseq system call: the current rseq_size, raised to
 * ORIG_RSEQ_ALLOC_SIZE (32 bytes) when it is smaller than that.
 */
static
unsigned int get_rseq_min_alloc_size(void)
{
        const unsigned int current_size = rseq_size;

        return current_size < ORIG_RSEQ_ALLOC_SIZE ?
                ORIG_RSEQ_ALLOC_SIZE : current_size;
}

/*
 * Return the rseq feature size advertised by the kernel through the
 * auxiliary vector.
 *
 * getauxval(AT_RSEQ_FEATURE_SIZE) yields:
 *
 * 0:   no value reported, fall back to ORIG_RSEQ_FEATURE_SIZE (20)
 * > 0: that value is returned as-is.
 *
 * The result is expected never to be below ORIG_RSEQ_FEATURE_SIZE. Both the
 * reported alignment and the reported feature size are asserted to fit in
 * RSEQ_THREAD_AREA_ALLOC_SIZE; the alignment is not used otherwise.
 */
static
unsigned int get_rseq_kernel_feature_size(void)
{
        unsigned long auxv_rseq_align;
        unsigned long auxv_rseq_feature_size;

        auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
        assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);

        auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
        assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);

        return auxv_rseq_feature_size ? auxv_rseq_feature_size
                                      : ORIG_RSEQ_FEATURE_SIZE;
}

/*
 * Register the calling thread's rseq area with the kernel.
 *
 * Returns 0 on success and -1 when the rseq system call fails. When libc
 * owns the registration nothing is done and 0 is returned.
 */
int rseq_register_current_thread(void)
{
        int rc;

        /* Treat libc's ownership as a successful registration. */
        if (!rseq_ownership)
                return 0;

        rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
        if (rc == 0) {
                assert(rseq_current_cpu_raw() >= 0);

                /*
                 * Mimic the libc behavior: rseq_size only becomes non-zero
                 * once a first thread has registered successfully.
                 */
                if (RSEQ_READ_ONCE(rseq_size) == 0) {
                        RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size());
                }
                return 0;
        }

        /*
         * A non-zero rseq_size means some thread already registered
         * successfully; a later failure in the same process is incoherent
         * and is treated as fatal.
         */
        if (RSEQ_READ_ONCE(rseq_size) > 0)
                abort();
        return -1;
}

/*
 * Unregister the calling thread's rseq area from the kernel.
 *
 * Returns 0 on success and -1 when the rseq system call fails. When libc
 * owns the registration nothing is done and 0 is returned.
 */
int rseq_unregister_current_thread(void)
{
        if (rseq_ownership) {
                int rc;

                rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(),
                              RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
                return rc ? -1 : 0;
        }

        /* Treat libc's ownership as a successful unregistration. */
        return 0;
}

static __attribute__((constructor))
void rseq_init(void)
{
        /*
         * If the libc's registered rseq size isn't already valid, it may be
         * because the binary is dynamically linked and not necessarily due to
         * libc not having registered a restartable sequence.  Try to find the
         * symbols if that's the case.
         */
        if (!libc_rseq_size_p || !*libc_rseq_size_p) {
                libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
                libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
                libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
        }
        if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
                        *libc_rseq_size_p != 0) {
                unsigned int libc_rseq_size;

                /* rseq registration owned by glibc */
                rseq_offset = *libc_rseq_offset_p;
                libc_rseq_size = *libc_rseq_size_p;
                rseq_flags = *libc_rseq_flags_p;

                /*
                 * Previous versions of glibc expose the value
                 * 32 even though the kernel only supported 20
                 * bytes initially. Therefore treat 32 as a
                 * special-case. glibc 2.40 exposes a 20 bytes
                 * __rseq_size without using getauxval(3) to
                 * query the supported size, while still allocating a 32
                 * bytes area. Also treat 20 as a special-case.
                 *
                 * Special-cases are handled by using the following
                 * value as active feature set size:
                 *
                 *   rseq_size = min(32, get_rseq_kernel_feature_size())
                 */
                switch (libc_rseq_size) {
                case ORIG_RSEQ_FEATURE_SIZE:
                        fallthrough;
                case ORIG_RSEQ_ALLOC_SIZE:
                {
                        unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();

                        if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
                                rseq_size = rseq_kernel_feature_size;
                        else
                                rseq_size = ORIG_RSEQ_ALLOC_SIZE;
                        break;
                }
                default:
                        /* Otherwise just use the __rseq_size from libc as rseq_size. */
                        rseq_size = libc_rseq_size;
                        break;
                }
                return;
        }
        rseq_ownership = 1;

        /* Calculate the offset of the rseq area from the thread pointer. */
        rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();

        /* rseq flags are deprecated, always set to 0. */
        rseq_flags = 0;

        /*
         * Set the size to 0 until at least one thread registers to mimic the
         * libc behavior.
         */
        rseq_size = 0;
}

/*
 * Destructor: when this file owned the rseq registration, put rseq_offset,
 * rseq_size and rseq_ownership back to 0, -1U and 0 respectively. Nothing is
 * touched when libc owns the registration.
 */
static __attribute__((destructor))
void rseq_exit(void)
{
        if (rseq_ownership) {
                rseq_offset = 0;
                rseq_size = -1U;
                rseq_ownership = 0;
        }
}

/*
 * Return the CPU the calling thread is running on, as reported by
 * sched_getcpu(). A failure is reported with perror() and aborts the
 * process, so the returned value is never negative.
 */
int32_t rseq_fallback_current_cpu(void)
{
        int32_t cpu = sched_getcpu();

        if (cpu >= 0)
                return cpu;

        perror("sched_getcpu()");
        abort();
}

/*
 * Return the node the calling thread is running on, as reported by the
 * getcpu system call.
 *
 * Unlike rseq_fallback_current_cpu(), a failure does not abort: it is
 * reported with perror() and the non-zero value returned by sys_getcpu() is
 * passed back to the caller.
 */
int32_t rseq_fallback_current_node(void)
{
        uint32_t cpu_id, node_id;
        int ret;

        ret = sys_getcpu(&cpu_id, &node_id);
        if (ret == 0)
                return (int32_t) node_id;

        perror("sys_getcpu()");
        return ret;
}