root/arch/riscv/kernel/usercfi.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Rivos, Inc.
 * Deepak Gupta <debug@rivosinc.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/uaccess.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <linux/prctl.h>
#include <asm/csr.h>
#include <asm/usercfi.h>

/*
 * Bitmask of user CFI features that were turned off on the kernel command
 * line through "riscv_nousercfi=" (CMDLINE_DISABLE_RISCV_USERCFI_* bits).
 * Zero means nothing was disabled. Consulted by is_user_shstk_enabled()
 * and is_user_lpad_enabled().
 */
unsigned long riscv_nousercfi __read_mostly;

/*
 * Size in bytes of a single shadow stack entry. Restore tokens occupy one
 * entry and must be aligned to this size.
 */
#define SHSTK_ENTRY_SIZE sizeof(void *)

/*
 * Report whether the shadow stack is currently switched on for @task,
 * according to the enable flag kept in its per-thread CFI state.
 */
bool is_shstk_enabled(struct task_struct *task)
{
        return task->thread_info.user_cfi_state.ubcfi_en != 0;
}

/*
 * Report whether @task has a shadow stack region recorded, i.e. whether
 * the saved shadow stack base address is non-zero.
 */
bool is_shstk_allocated(struct task_struct *task)
{
        return task->thread_info.user_cfi_state.shdw_stk_base != 0;
}

/*
 * Report whether the shadow stack configuration of @task has been locked
 * (see set_shstk_lock()).
 */
bool is_shstk_locked(struct task_struct *task)
{
        return task->thread_info.user_cfi_state.ubcfi_locked != 0;
}

/*
 * Record the shadow stack region owned by @task.
 *
 * @shstk_addr: start address of the region (0 means "no region recorded").
 * @size:       length of the region in bytes.
 */
void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size)
{
        task->thread_info.user_cfi_state.shdw_stk_size = size;
        task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr;
}

/*
 * Fetch the shadow stack region recorded for @task.
 *
 * @size: optional out-parameter; when non-NULL it receives the recorded
 *        region length in bytes.
 *
 * Returns the recorded base address (0 if none is recorded).
 */
unsigned long get_shstk_base(struct task_struct *task, unsigned long *size)
{
        if (!size)
                return task->thread_info.user_cfi_state.shdw_stk_base;

        *size = task->thread_info.user_cfi_state.shdw_stk_size;
        return task->thread_info.user_cfi_state.shdw_stk_base;
}

/*
 * Record @shstk_addr as the current user shadow stack pointer of @task in
 * its per-thread CFI state. Only the saved value is updated here.
 */
void set_active_shstk(struct task_struct *task, unsigned long shstk_addr)
{
        task->thread_info.user_cfi_state.user_shdw_stk = shstk_addr;
}

/*
 * Return the user shadow stack pointer currently saved for @task.
 */
unsigned long get_active_shstk(struct task_struct *task)
{
        unsigned long ssp = task->thread_info.user_cfi_state.user_shdw_stk;

        return ssp;
}

void set_shstk_status(struct task_struct *task, bool enable)
{
        if (!is_user_shstk_enabled())
                return;

        task->thread_info.user_cfi_state.ubcfi_en = enable ? 1 : 0;

        if (enable)
                task->thread.envcfg |= ENVCFG_SSE;
        else
                task->thread.envcfg &= ~ENVCFG_SSE;

        csr_write(CSR_ENVCFG, task->thread.envcfg);
}

/*
 * Mark the shadow stack configuration of @task as locked. Once set,
 * arch_set_shadow_stack_status() rejects further changes; nothing in this
 * file clears the flag again.
 */
void set_shstk_lock(struct task_struct *task)
{
        task->thread_info.user_cfi_state.ubcfi_locked = 1;
}

/*
 * Report whether indirect branch tracking (landing pads) is currently
 * switched on for @task, according to its per-thread CFI state.
 */
bool is_indir_lp_enabled(struct task_struct *task)
{
        return task->thread_info.user_cfi_state.ufcfi_en != 0;
}

/*
 * Report whether the indirect branch tracking configuration of @task has
 * been locked (see set_indir_lp_lock()).
 */
bool is_indir_lp_locked(struct task_struct *task)
{
        return task->thread_info.user_cfi_state.ufcfi_locked != 0;
}

void set_indir_lp_status(struct task_struct *task, bool enable)
{
        if (!is_user_lpad_enabled())
                return;

        task->thread_info.user_cfi_state.ufcfi_en = enable ? 1 : 0;

        if (enable)
                task->thread.envcfg |= ENVCFG_LPE;
        else
                task->thread.envcfg &= ~ENVCFG_LPE;

        csr_write(CSR_ENVCFG, task->thread.envcfg);
}

/*
 * Mark the indirect branch tracking configuration of @task as locked. Once
 * set, arch_set_indir_br_lp_status() rejects further changes; nothing in
 * this file clears the flag again.
 */
void set_indir_lp_lock(struct task_struct *task)
{
        task->thread_info.user_cfi_state.ufcfi_locked = 1;
}
/*
 * Work out how large a shadow stack mapping should be.
 *
 * A non-zero @size is simply rounded up to a page boundary. A zero @size
 * means "pick a default": the shadow stack is then sized like the regular
 * stack, i.e. the RLIMIT_STACK limit capped at 4G, rounded up to a page
 * boundary.
 */
static unsigned long calc_shstk_size(unsigned long size)
{
        unsigned long long dflt;

        if (!size) {
                dflt = min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G);
                return PAGE_ALIGN(dflt);
        }

        return PAGE_ALIGN(size);
}

/*
 * Atomically swap @val into the user shadow stack slot at @addr.
 *
 * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
 * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
 * shadow stack. To keep it simple, we use `ssamoswap` to perform writes on shadow
 * stack.
 *
 * Returns the value previously stored at @addr, or -1 if the access faulted.
 * -1 doubles as the error sentinel because a shadow stack is only expected to
 * hold return addresses and zero.
 */
static noinline unsigned long amo_user_shstk(unsigned long __user *addr, unsigned long val)
{
        /*
         * Never expect -1 on shadow stack. Expect return addresses and zero
         */
        unsigned long swap = -1;

        /* User access is enabled only around the swap and disabled on both exits. */
        __enable_user_access();
        /*
         * The exception table entry for label 1 sends a faulting ssamoswap to
         * the `fault` label below instead of letting the fault propagate.
         */
        asm goto(".option push\n"
                ".option arch, +zicfiss\n"
                "1: ssamoswap.d %[swap], %[val], %[addr]\n"
                _ASM_EXTABLE(1b, %l[fault])
                ".option pop\n"
                 : [swap] "=r" (swap), [addr] "+A" (*(__force unsigned long *)addr)
                : [val] "r" (val)
                : "memory"
                : fault
                );
        __disable_user_access();
        return swap;
fault:
        /* The swap did not complete; report failure with the sentinel. */
        __disable_user_access();
        return -1;
}

/*
 * Place a restore token on the shadow stack just below @ssp.
 *
 * The token lives in the entry at @ssp - SHSTK_ENTRY_SIZE and its value is
 * @ssp itself, so on RISC-V a token is a function of its own address. A
 * token is one shadow stack entry wide and must be entry aligned.
 *
 * @token_addr: optional out-parameter; when non-NULL it receives the address
 *              the token was written to.
 *
 * Returns 0 on success, -EINVAL if @ssp is misaligned, -EFAULT if the write
 * to the shadow stack faulted.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
        unsigned long slot = ssp - SHSTK_ENTRY_SIZE;

        if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
                return -EINVAL;

        if (amo_user_shstk((unsigned long __user *)slot, ssp) == -1)
                return -EFAULT;

        if (token_addr)
                *token_addr = slot;

        return 0;
}

/*
 * Save the user shadow stack pointer of @tsk on the shadow stack itself.
 *
 * A restore token is written just below the task's active shadow stack
 * pointer. On success the token's address is stored in @saved_shstk_ptr and
 * also becomes the task's new active shadow stack pointer.
 *
 * Returns 0 on success, -EINVAL if @saved_shstk_ptr is NULL or the active
 * pointer is misaligned, -EFAULT if the token could not be written.
 */
int save_user_shstk(struct task_struct *tsk, unsigned long *saved_shstk_ptr)
{
        unsigned long tok_addr = 0;
        int err;

        if (!saved_shstk_ptr)
                return -EINVAL;

        err = create_rstor_token(get_active_shstk(tsk), &tok_addr);
        if (err)
                return err;

        *saved_shstk_ptr = tok_addr;
        set_active_shstk(tsk, tok_addr);

        return 0;
}

/*
 * Restore the user shadow stack pointer of @tsk from the token stored at
 * @shstk_ptr.
 *
 * The token slot is swapped with zero, which consumes the token whether or
 * not it turns out to be valid. A valid token holds the address one entry
 * above its own location; that value becomes the task's active shadow stack
 * pointer.
 *
 * Returns 0 on success, -EFAULT if the slot could not be accessed, -EINVAL
 * (with a rate-limited log message) if the token is malformed.
 */
int restore_user_shstk(struct task_struct *tsk, unsigned long shstk_ptr)
{
        unsigned long tok = amo_user_shstk((unsigned long __user *)shstk_ptr, 0);

        if (tok == -1)
                return -EFAULT;

        /* A well-formed token points exactly one entry above where it is stored. */
        if ((tok - shstk_ptr) == SHSTK_ENTRY_SIZE) {
                set_active_shstk(tsk, tok);
                return 0;
        }

        pr_info_ratelimited("%s[%d]: bad restore token in %s: pc=%p sp=%p, token=%p, shstk_ptr=%p\n",
                            tsk->comm, task_pid_nr(tsk), __func__,
                            (void *)(task_pt_regs(tsk)->epc),
                            (void *)(task_pt_regs(tsk)->sp),
                            (void *)tok, (void *)shstk_ptr);
        return -EINVAL;
}

/*
 * Map a new shadow stack region in the current mm.
 *
 * @addr:         requested address, or 0 to let the kernel choose. A non-zero
 *                address is mapped with MAP_FIXED_NOREPLACE.
 * @size:         length of the mapping in bytes.
 * @token_offset: offset from the mapping start that a restore token should
 *                sit directly below; only used when @set_tok is true.
 * @set_tok:      whether to write a restore token into the new mapping.
 *
 * Returns the start address of the mapping, or an error value recognizable
 * with IS_ERR_VALUE(). If the token cannot be written the mapping is torn
 * down again and -EINVAL is returned.
 */
static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
                                           unsigned long token_offset, bool set_tok)
{
        struct mm_struct *mm = current->mm;
        int map_flags = MAP_ANONYMOUS | MAP_PRIVATE;
        unsigned long populate;
        unsigned long mapped;

        if (addr)
                map_flags |= MAP_FIXED_NOREPLACE;

        mmap_write_lock(mm);
        mapped = do_mmap(NULL, addr, size, PROT_READ, map_flags,
                         VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
        mmap_write_unlock(mm);

        /* Nothing more to do on mapping failure or when no token is wanted. */
        if (IS_ERR_VALUE(mapped) || !set_tok)
                return mapped;

        if (!create_rstor_token(mapped + token_offset, NULL))
                return mapped;

        /* Token setup failed: do not leave a half-initialized mapping behind. */
        vm_munmap(mapped, size);
        return -EINVAL;
}

/*
 * map_shadow_stack() - map a new user shadow stack.
 *
 * @addr:  page aligned address hint, or 0 to let the kernel choose.
 * @size:  requested size in bytes; the mapping is rounded up to whole pages.
 * @flags: 0 or SHADOW_STACK_SET_TOKEN.
 *
 * Returns the mapping address on success. Errors: -EOPNOTSUPP when user
 * shadow stacks are unavailable, -EINVAL for unknown flags or a misaligned
 * @addr, -ENOSPC when a token is requested but @size cannot hold one,
 * -EOVERFLOW when rounding @size up wraps, or whatever
 * allocate_shadow_stack() reports.
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
        const bool want_token = flags & SHADOW_STACK_SET_TOKEN;
        unsigned long map_len;

        if (!is_user_shstk_enabled())
                return -EOPNOTSUPP;

        /* SHADOW_STACK_SET_TOKEN is the only flag understood here. */
        if (flags & ~SHADOW_STACK_SET_TOKEN)
                return -EINVAL;

        /*
         * Unlike other architectures, on RISC-V the shadow stack pointer lives in
         * CSR_SSP, a CSR that is available in all modes. CSRs are addressed by a
         * 12-bit index encoded in the instruction itself, so a write to CSR_SSP
         * can never be unintentional from the programmer's point of view. As long
         * as the code paths that write CSR_SSP are properly guarded, shadow stack
         * pivoting is not possible. Since user mode can write CSR_SSP, it could set
         * up a shadow stack token by itself after the allocation. Still, for
         * portability with other architectures (`map_shadow_stack` is an
         * arch-agnostic syscall), RISC-V honours the token flag: when it is given,
         * a token is written into the last entry below addr + size.
         */

        /* A token needs at least one full entry of room. */
        if (want_token && size < SHSTK_ENTRY_SIZE)
                return -ENOSPC;

        /* A caller supplied address must be page aligned (0 trivially is). */
        if (addr & (PAGE_SIZE - 1))
                return -EINVAL;

        map_len = PAGE_ALIGN(size);
        if (map_len < size)
                return -EOVERFLOW;

        return allocate_shadow_stack(addr, map_len, size, want_token);
}

/*
 * Set up the shadow stack of a new task during clone/clone3/fork.
 *
 * A fresh shadow stack is only needed when CLONE_VM is given without
 * CLONE_VFORK: the new thread then runs on a separate regular stack in the
 * same address space and needs a separate shadow stack as well. The
 * allocation is made in the current mm and sized from @args->stack_size via
 * calc_shstk_size().
 *
 * Returns 0 when nothing was allocated (shadow stacks unsupported, not
 * enabled on @tsk, CLONE_VFORK, or !CLONE_VM), the top address of the new
 * shadow stack (base + size) on success, or an IS_ERR_VALUE() error from the
 * allocation.
 */
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
                                       const struct kernel_clone_args *args)
{
        unsigned long stk_base, stk_size;

        /* Unsupported, or not enabled on the new thread: nothing to switch to. */
        if (!is_user_shstk_enabled() || !is_shstk_enabled(tsk))
                return 0;

        /*
         * A CLONE_VFORK child shares the parent's shadow stack. base = 0 and
         * size = 0 is the marker that tells the freeing logic run for the child
         * to leave that stack alone.
         */
        if (args->flags & CLONE_VFORK) {
                set_shstk_base(tsk, 0, 0);
                return 0;
        }

        /* Without CLONE_VM the child uses a copy of the parent's shadow stack. */
        if (!(args->flags & CLONE_VM))
                return 0;

        /* CLONE_VM thread: give it a shadow stack of its own (in current's mm). */
        stk_size = calc_shstk_size(args->stack_size);
        stk_base = allocate_shadow_stack(0, stk_size, 0, false);
        if (IS_ERR_VALUE(stk_base))
                return stk_base;

        set_shstk_base(tsk, stk_base, stk_size);

        return stk_base + stk_size;
}

/*
 * Unmap the shadow stack owned by @tsk, if there is one to free.
 *
 * Nothing happens when shadow stacks are unsupported or not enabled on
 * @tsk, when @tsk has no mm or an mm different from current's, or when
 * @tsk has no base recorded.
 */
void shstk_release(struct task_struct *tsk)
{
        unsigned long stk_size = 0;
        unsigned long stk_base;

        if (!(is_user_shstk_enabled() && is_shstk_enabled(tsk)))
                return;

        /*
         * When fork() with CLONE_VM fails, the child (tsk) already has a shadow
         * stack allocated and exit_thread() calls this function to free it. In
         * that case parent (current) and child share one mm. Only proceed when
         * the two mms match.
         */
        if (!tsk->mm || tsk->mm != current->mm)
                return;

        /*
         * Shadow stack is enabled, but a zero base means this task does not
         * manage its own shadow stack (CLONE_VFORK), so there is nothing to free.
         */
        stk_base = get_shstk_base(tsk, &stk_size);
        if (stk_base) {
                vm_munmap(stk_base, stk_size);
                set_shstk_base(tsk, 0, 0);
        }
}

/*
 * Report the shadow stack status of task @t to user space.
 *
 * Writes a status word to @status with PR_SHADOW_STACK_ENABLE set when the
 * shadow stack is enabled on @t.
 *
 * Returns 0 on success, -EINVAL when user shadow stacks are unavailable,
 * -EFAULT when @status cannot be written.
 */
int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
{
        unsigned long bcfi_status = 0;

        if (!is_user_shstk_enabled())
                return -EINVAL;

        if (is_shstk_enabled(t))
                bcfi_status |= PR_SHADOW_STACK_ENABLE;

        if (copy_to_user(status, &bcfi_status, sizeof(bcfi_status)))
                return -EFAULT;

        return 0;
}

/*
 * Enable or disable the shadow stack on task @t.
 *
 * @status: bits from PR_SHADOW_STACK_SUPPORTED_STATUS_MASK; the presence of
 *          PR_SHADOW_STACK_ENABLE selects enable versus disable.
 *
 * Enabling a task that is not yet enabled allocates a default sized shadow
 * stack and makes its top the active shadow stack pointer. Disabling
 * releases the task's shadow stack via shstk_release().
 *
 * Returns 0 on success; -EINVAL when user shadow stacks are unavailable,
 * @status has unknown bits, the status is locked, or an enable is requested
 * while a shadow stack is still recorded for a not-enabled task; -ENOMEM
 * when the shadow stack cannot be allocated.
 */
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
{
        unsigned long stk_size, stk_base;
        bool want_enable;

        if (!is_user_shstk_enabled())
                return -EINVAL;

        /* Unknown flags are refused. */
        if (status & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
                return -EINVAL;

        /* Once locked, user space can no longer change the status. */
        if (is_shstk_locked(t))
                return -EINVAL;

        want_enable = status & PR_SHADOW_STACK_ENABLE;

        if (!want_enable) {
                /*
                 * Disable request: release the shadow stack. A CLONE_VFORKed
                 * child that does this will not actually free anything (the
                 * stack may still be needed by the parent), but the shadow
                 * stack is still disabled for the child. If that child later
                 * enables again it gets an entirely new shadow stack, because
                 *  - shadow stack is then not enabled for the vforked child
                 *  - its shadow stack base was pointing to 0 anyway
                 * That is acceptable: the parent keeps its shadow stack for
                 * when the child releases resources via exit or exec, while
                 * the child may break away and set up a new one if it wants.
                 */
                shstk_release(t);
        } else if (!is_shstk_enabled(t)) {
                /*
                 * Enable request on a task that is not enabled yet. A shadow
                 * stack that is still recorded at this point is a use case
                 * that is not supported.
                 */
                if (is_shstk_allocated(t))
                        return -EINVAL;

                stk_size = calc_shstk_size(0);
                stk_base = allocate_shadow_stack(0, stk_size, 0, false);
                if (IS_ERR_VALUE(stk_base))
                        return -ENOMEM;

                set_shstk_base(t, stk_base, stk_size);
                set_active_shstk(t, stk_base + stk_size);
        }

        set_shstk_status(t, want_enable);
        return 0;
}

/*
 * Lock the shadow stack status of @task against further changes.
 *
 * @arg: must be 0.
 *
 * Returns 0 on success, -EINVAL when user shadow stacks are unavailable, the
 * shadow stack is not enabled on @task, or @arg is non-zero.
 */
int arch_lock_shadow_stack_status(struct task_struct *task,
                                  unsigned long arg)
{
        if (!is_user_shstk_enabled())
                return -EINVAL;

        /* There is nothing to lock on a task without an enabled shadow stack. */
        if (!is_shstk_enabled(task))
                return -EINVAL;

        if (arg)
                return -EINVAL;

        set_shstk_lock(task);

        return 0;
}

/*
 * Report the indirect branch tracking status of task @t to user space.
 *
 * Writes a status word to @status with PR_INDIR_BR_LP_ENABLE set when
 * indirect branch tracking is enabled on @t.
 *
 * Returns 0 on success, -EINVAL when user landing pads are unavailable,
 * -EFAULT when @status cannot be written.
 */
int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
{
        unsigned long fcfi_status = 0;

        if (!is_user_lpad_enabled())
                return -EINVAL;

        if (is_indir_lp_enabled(t))
                fcfi_status |= PR_INDIR_BR_LP_ENABLE;

        if (copy_to_user(status, &fcfi_status, sizeof(fcfi_status)))
                return -EFAULT;

        return 0;
}

/*
 * Enable or disable indirect branch tracking on task @t.
 *
 * @status: either 0 (disable) or PR_INDIR_BR_LP_ENABLE (enable).
 *
 * Returns 0 on success, -EINVAL when user landing pads are unavailable, the
 * status is locked, or @status carries unknown bits.
 */
int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
{
        if (!is_user_lpad_enabled())
                return -EINVAL;

        /* Once locked, user space can no longer change the status. */
        if (is_indir_lp_locked(t))
                return -EINVAL;

        /* Unknown flags are refused. */
        if (status & ~PR_INDIR_BR_LP_ENABLE)
                return -EINVAL;

        set_indir_lp_status(t, (status & PR_INDIR_BR_LP_ENABLE) != 0);

        return 0;
}

/*
 * Lock the indirect branch tracking status of @task against further changes.
 *
 * @arg: must be 0.
 *
 * Returns 0 on success, -EINVAL when user landing pads are unavailable,
 * indirect branch tracking is not enabled on @task, or @arg is non-zero.
 */
int arch_lock_indir_br_lp_status(struct task_struct *task,
                                 unsigned long arg)
{
        if (!is_user_lpad_enabled())
                return -EINVAL;

        /* There is nothing to lock on a task without tracking enabled. */
        if (!is_indir_lp_enabled(task))
                return -EINVAL;

        if (arg)
                return -EINVAL;

        set_indir_lp_lock(task);

        return 0;
}

/*
 * Report whether user space shadow stacks are available at all: the CPU
 * must support them and they must not have been disabled through the
 * "riscv_nousercfi=" command line option.
 */
bool is_user_shstk_enabled(void)
{
        if (!cpu_supports_shadow_stack())
                return false;

        return !(riscv_nousercfi & CMDLINE_DISABLE_RISCV_USERCFI_BCFI);
}

/*
 * Report whether user space landing pads (indirect branch tracking) are
 * available at all: the CPU must support them and they must not have been
 * disabled through the "riscv_nousercfi=" command line option.
 */
bool is_user_lpad_enabled(void)
{
        if (!cpu_supports_indirect_br_lp_instr())
                return false;

        return !(riscv_nousercfi & CMDLINE_DISABLE_RISCV_USERCFI_FCFI);
}

/*
 * Parse the "riscv_nousercfi=" kernel command line option.
 *
 * Recognized values:
 *   "all"  - disable every user CFI feature
 *   "fcfi" - disable landing pads (forward CFI)
 *   "bcfi" - disable shadow stacks (backward CFI)
 *
 * Any other value changes nothing and is reported with a warning, so that a
 * typo does not silently leave a feature enabled that the administrator
 * meant to turn off.
 *
 * Always returns 1 to mark the option as handled.
 */
static int __init setup_global_riscv_enable(char *str)
{
        if (strcmp(str, "all") == 0)
                riscv_nousercfi = CMDLINE_DISABLE_RISCV_USERCFI;
        else if (strcmp(str, "fcfi") == 0)
                riscv_nousercfi |= CMDLINE_DISABLE_RISCV_USERCFI_FCFI;
        else if (strcmp(str, "bcfi") == 0)
                riscv_nousercfi |= CMDLINE_DISABLE_RISCV_USERCFI_BCFI;
        else
                pr_warn("riscv_nousercfi: unrecognized value \"%s\" ignored (expected all, fcfi or bcfi)\n",
                        str);

        if (riscv_nousercfi)
                pr_info("RISC-V user CFI disabled via cmdline - shadow stack status : %s, landing pad status : %s\n",
                        (riscv_nousercfi & CMDLINE_DISABLE_RISCV_USERCFI_BCFI) ? "disabled" :
                        "enabled", (riscv_nousercfi & CMDLINE_DISABLE_RISCV_USERCFI_FCFI) ?
                        "disabled" : "enabled");

        return 1;
}

__setup("riscv_nousercfi=", setup_global_riscv_enable);