root/arch/x86/kernel/shstk.c
// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

/* Size in bytes of one shadow stack entry; the SSP is moved in steps of this. */
#define SS_FRAME_SIZE 8

/*
 * Report whether at least one of the bits in @features is set in the
 * current task's thread feature mask.
 */
static bool features_enabled(unsigned long features)
{
        return (current->thread.features & features) != 0;
}

/* Turn on the bits in @features in the current task's thread feature mask. */
static void features_set(unsigned long features)
{
        current->thread.features = current->thread.features | features;
}

/* Turn off the bits in @features in the current task's thread feature mask. */
static void features_clr(unsigned long features)
{
        current->thread.features = current->thread.features & ~features;
}

/*
 * Create a restore token on the shadow stack.  A token is always 8-byte
 * and aligned to 8.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
        unsigned long addr;

        /* Token must be aligned */
        if (!IS_ALIGNED(ssp, 8))
                return -EINVAL;

        addr = ssp - SS_FRAME_SIZE;

        /*
         * SSP is aligned, so reserved bits and mode bit are a zero, just mark
         * the token 64-bit.
         */
        ssp |= BIT(0);

        if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
                return -EFAULT;

        if (token_addr)
                *token_addr = addr;

        return 0;
}

/*
 * Map a new shadow stack for the current mm and optionally place a restore
 * token in it.
 *
 * @addr:         requested address, or 0 to let the kernel choose; a
 *                non-zero address is mapped with MAP_FIXED_NOREPLACE
 * @size:         length of the mapping
 * @token_offset: offset from the start of the mapping that the restore
 *                token should point at (only used when @set_res_tok)
 * @set_res_tok:  whether to write a restore token
 *
 * Returns the mapped address, the error value from do_mmap(), or -EINVAL
 * when the token could not be created (the mapping is removed again).
 *
 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
 * itself from attacks. The reasoning is as follows:
 *
 * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
 * INCSSP instruction can increment the shadow stack pointer. It is the
 * shadow stack analog of an instruction like:
 *
 *   addq $0x80, %rsp
 *
 * However, there is one important difference between an ADD on %rsp
 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
 * memory of the first and last elements that were "popped". It can be
 * thought of as acting like this:
 *
 * READ_ONCE(ssp);       // read+discard top element on stack
 * ssp += nr_to_pop * 8; // move the shadow stack
 * READ_ONCE(ssp-8);     // read+discard last popped stack element
 *
 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
 * it would read the memory. Therefore a single page gap will be enough
 * to prevent any operation from shifting the SSP to an adjacent stack,
 * since it would have to land in the gap at least once, causing a
 * fault.
 */
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
                                 unsigned long token_offset, bool set_res_tok)
{
        int map_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
        struct mm_struct *mm = current->mm;
        unsigned long mapped_addr, populate;

        if (addr)
                map_flags |= MAP_FIXED_NOREPLACE;

        mmap_write_lock(mm);
        mapped_addr = do_mmap(NULL, addr, size, PROT_READ, map_flags,
                              VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
        mmap_write_unlock(mm);

        /* Nothing more to do if mapping failed or no token was requested */
        if (IS_ERR_VALUE(mapped_addr) || !set_res_tok)
                return mapped_addr;

        /* Any token failure is reported as -EINVAL after undoing the map */
        if (create_rstor_token(mapped_addr + token_offset, NULL)) {
                vm_munmap(mapped_addr, size);
                return -EINVAL;
        }

        return mapped_addr;
}

/*
 * Compute the page-aligned size to use for a shadow stack.  A @size of 0
 * selects the default: the smaller of RLIMIT_STACK and 4G.
 */
static unsigned long adjust_shstk_size(unsigned long size)
{
        if (!size)
                return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));

        return PAGE_ALIGN(size);
}

/*
 * Unmap the shadow stack at [@base, @base + @size) and warn once if that
 * fails for any reason other than -EINTR.
 */
static void unmap_shadow_stack(u64 base, u64 size)
{
        int ret = vm_munmap(base, size);

        /*
         * mmap_write_lock_killable() failed with -EINTR. This means
         * the process is about to die and have its MM cleaned up.
         * This task shouldn't ever make it back to userspace. In this
         * case it is ok to leak a shadow stack, so just exit out.
         */
        if (ret == -EINTR)
                return;

        /*
         * For all other types of vm_munmap() failure, either the
         * system is out of memory or there is a bug.
         */
        WARN_ON_ONCE(ret);
}

/*
 * Enable shadow stack for the current task: allocate a default-sized
 * shadow stack, point the user SSP at its top, set CET_SHSTK_EN and
 * record the stack in the thread state.
 *
 * Returns 0 on success or when shadow stack is already enabled,
 * -EOPNOTSUPP when the CPU feature is off or the caller is in a 32-bit
 * syscall, or the error from the allocation.
 */
static int shstk_setup(void)
{
        struct thread_shstk *shstk = &current->thread.shstk;
        unsigned long base, len;

        /* Already enabled */
        if (features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /* Also not supported for 32 bit */
        if (in_ia32_syscall())
                return -EOPNOTSUPP;

        len = adjust_shstk_size(0);
        base = alloc_shstk(0, len, 0, false);
        if (IS_ERR_VALUE(base))
                return PTR_ERR((void *)base);

        /* The SSP starts at the highest address of the new mapping */
        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, base + len);
        wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN);
        fpregs_unlock();

        shstk->base = base;
        shstk->size = len;
        features_set(ARCH_SHSTK_SHSTK);

        return 0;
}

/*
 * Clear all shadow stack state of the current task: the tracked stack
 * (base/size), the enabled feature bits and the locked feature bits.
 */
void reset_thread_features(void)
{
        struct thread_shstk *shstk = &current->thread.shstk;

        memset(shstk, 0, sizeof(*shstk));
        current->thread.features = 0;
        current->thread.features_locked = 0;
}

/*
 * Allocate a shadow stack for a newly cloned task when one is needed.
 *
 * @tsk:         the new task whose shadow stack tracking is updated
 * @clone_flags: clone flags of the request
 * @stack_size:  requested size; 0 selects the default size
 *
 * Returns 0 when no new shadow stack is set up, an error value (see
 * IS_ERR_VALUE()) when the allocation fails, or otherwise the address
 * just past the end of the new shadow stack (its initial SSP).
 */
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, u64 clone_flags,
                                       unsigned long stack_size)
{
        struct thread_shstk *shstk = &tsk->thread.shstk;
        unsigned long base, len;

        /*
         * If shadow stack is not enabled on the new thread, skip any
         * switch to a new shadow stack.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /*
         * For CLONE_VFORK the child will share the parent's shadow stack.
         * Make sure to clear the internal tracking of the thread shadow
         * stack so the freeing logic run for the child knows to leave it
         * alone.
         */
        if (clone_flags & CLONE_VFORK) {
                shstk->base = 0;
                shstk->size = 0;
                return 0;
        }

        /*
         * For !CLONE_VM the child will use a copy of the parent's shadow
         * stack.
         */
        if (!(clone_flags & CLONE_VM))
                return 0;

        len = adjust_shstk_size(stack_size);
        base = alloc_shstk(0, len, 0, false);
        if (IS_ERR_VALUE(base))
                return base;

        shstk->base = base;
        shstk->size = len;

        return base + len;
}

/* Read the user shadow stack pointer from MSR_IA32_PL3_SSP. */
static unsigned long get_user_shstk_addr(void)
{
        unsigned long long user_ssp;

        fpregs_lock_and_load();
        rdmsrq(MSR_IA32_PL3_SSP, user_ssp);
        fpregs_unlock();

        return user_ssp;
}

/*
 * Pop one entry off the user shadow stack.
 *
 * @val: optional; when non-NULL it receives the entry read from the
 *       current SSP
 *
 * Returns 0 on success (SSP advanced by one frame), -ENOTSUPP when shadow
 * stack is not enabled, or -EFAULT when the entry cannot be read, in which
 * case the SSP is left unchanged.
 */
int shstk_pop(u64 *val)
{
        int ret = 0;
        u64 cur_ssp;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -ENOTSUPP;

        fpregs_lock_and_load();

        rdmsrq(MSR_IA32_PL3_SSP, cur_ssp);

        /*
         * NOTE(review): the user read below runs with the fpregs lock
         * held - confirm get_user() faulting is acceptable in that context.
         */
        if (!val || !get_user(*val, (__user u64 *)cur_ssp))
                wrmsrq(MSR_IA32_PL3_SSP, cur_ssp + SS_FRAME_SIZE);
        else
                ret = -EFAULT;

        fpregs_unlock();

        return ret;
}

/*
 * Push @val onto the user shadow stack.
 *
 * Returns 0 on success (SSP lowered by one frame), -ENOTSUPP when shadow
 * stack is not enabled, or the error from write_user_shstk_64(), in which
 * case the SSP is left unchanged.
 */
int shstk_push(u64 val)
{
        u64 new_ssp;
        int err;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -ENOTSUPP;

        fpregs_lock_and_load();

        rdmsrq(MSR_IA32_PL3_SSP, new_ssp);
        new_ssp -= SS_FRAME_SIZE;

        /* Only commit the new SSP once the entry has been written */
        err = write_user_shstk_64((__user void *)new_ssp, val);
        if (err == 0)
                wrmsrq(MSR_IA32_PL3_SSP, new_ssp);

        fpregs_unlock();

        return err;
}

/* High bit used to tag a shadow stack entry as data */
#define SHSTK_DATA_BIT BIT(63)

/*
 * Write @data to the user shadow stack at @addr with SHSTK_DATA_BIT set.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) if @data already
 * has SHSTK_DATA_BIT set, or -EFAULT if the write fails.
 */
static int put_shstk_data(u64 __user *addr, u64 data)
{
        u64 tagged;

        if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
                return -EINVAL;

        /*
         * Mark the high bit so that the sigframe can't be processed as a
         * return address.
         */
        tagged = data | SHSTK_DATA_BIT;

        return write_user_shstk_64(addr, tagged) ? -EFAULT : 0;
}

/*
 * Read a data-tagged entry from user address @addr and store it, with
 * SHSTK_DATA_BIT stripped, in @data.
 *
 * Returns 0 on success, -EFAULT if the read fails, or -EINVAL if the
 * entry does not carry SHSTK_DATA_BIT (@data is not written then).
 */
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
        unsigned long raw;

        if (unlikely(get_user(raw, addr)))
                return -EFAULT;

        if ((raw & SHSTK_DATA_BIT) == 0)
                return -EINVAL;

        *data = raw & ~SHSTK_DATA_BIT;

        return 0;
}

/*
 * Push a data-tagged copy of the incoming SSP onto the shadow stack.
 *
 * @ssp: in: current SSP; out: SSP lowered by one frame.  Note that *@ssp
 *       is lowered before the write is attempted, so it is also modified
 *       when -EFAULT is returned.
 *
 * Returns 0 on success, -EINVAL if the SSP is not 8-byte aligned, or
 * -EFAULT if the entry could not be stored.
 */
static int shstk_push_sigframe(unsigned long *ssp)
{
        unsigned long prev_ssp = *ssp;

        /* Token must be aligned */
        if (!IS_ALIGNED(prev_ssp, 8))
                return -EINVAL;

        *ssp = prev_ssp - SS_FRAME_SIZE;

        return put_shstk_data((void __user *)*ssp, prev_ssp) ? -EFAULT : 0;
}

/*
 * Read the data-tagged SSP stored at *@ssp and, if it is valid, replace
 * *@ssp with it.
 *
 * Returns 0 on success; -EINVAL if *@ssp or the stored value is not
 * 8-byte aligned, the entry is not data-tagged, or the stored value is
 * not below TASK_SIZE_MAX; -EFAULT if the entry cannot be read or the
 * address is not in a VM_SHADOW_STACK mapping; -EINTR if taking the mmap
 * lock was interrupted.
 */
static int shstk_pop_sigframe(unsigned long *ssp)
{
        struct vm_area_struct *vma;
        unsigned long token_addr;
        bool need_to_check_vma;
        int err;

        if (!IS_ALIGNED(*ssp, 8))
                return -EINVAL;

        /*
         * It is possible for the SSP to be off the end of a shadow stack by 4
         * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
         * before it, it might be this case, so check that the address being
         * read is actually shadow stack.
         */
        need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;

        if (need_to_check_vma && mmap_read_lock_killable(current->mm))
                return -EINTR;

        err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);

        if (!err && need_to_check_vma) {
                vma = find_vma(current->mm, *ssp);
                if (!vma || !(vma->vm_flags & VM_SHADOW_STACK))
                        err = -EFAULT;
        }

        /* If the lock was taken, drop it on success and failure alike */
        if (need_to_check_vma)
                mmap_read_unlock(current->mm);

        if (unlikely(err))
                return err;

        /* Restore SSP aligned? */
        if (unlikely(!IS_ALIGNED(token_addr, 8)))
                return -EINVAL;

        /* SSP in userspace? */
        if (unlikely(token_addr >= TASK_SIZE_MAX))
                return -EINVAL;

        *ssp = token_addr;

        return 0;
}

int setup_signal_shadow_stack(struct ksignal *ksig)
{
        void __user *restorer = ksig->ka.sa.sa_restorer;
        unsigned long ssp;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        if (!restorer)
                return -EINVAL;

        ssp = get_user_shstk_addr();
        if (unlikely(!ssp))
                return -EINVAL;

        err = shstk_push_sigframe(&ssp);
        if (unlikely(err))
                return err;

        /* Push restorer address */
        ssp -= SS_FRAME_SIZE;
        err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
        if (unlikely(err))
                return -EFAULT;

        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Restore the user SSP from the data-tagged entry found at the current
 * SSP.
 *
 * Returns 0 on success or when shadow stack is not in use, -EINVAL if the
 * SSP is 0, or the error from shstk_pop_sigframe().
 */
int restore_signal_shadow_stack(void)
{
        unsigned long restored_ssp;
        int ret;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return 0;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        restored_ssp = get_user_shstk_addr();
        if (unlikely(!restored_ssp))
                return -EINVAL;

        ret = shstk_pop_sigframe(&restored_ssp);
        if (unlikely(ret))
                return ret;

        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_PL3_SSP, restored_ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Unmap the shadow stack tracked for @tsk, if there is one and @tsk
 * shares its mm with the current task.  The tracked size is zeroed
 * afterwards; the base is kept so that a second free can be detected.
 */
void shstk_free(struct task_struct *tsk)
{
        struct thread_shstk *shstk = &tsk->thread.shstk;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return;

        /*
         * When fork() with CLONE_VM fails, the child (tsk) already has a
         * shadow stack allocated, and exit_thread() calls this function to
         * free it.  In this case the parent (current) and the child share
         * the same mm struct.
         */
        if (!tsk->mm)
                return;
        if (tsk->mm != current->mm)
                return;

        /*
         * If shstk->base is NULL, then this task is not managing its
         * own shadow stack (CLONE_VFORK). So skip freeing it.
         */
        if (!shstk->base)
                return;

        /*
         * shstk->base is NULL for CLONE_VFORK child tasks, and so is
         * normal. But size = 0 with a non-NULL shstk->base is not normal
         * and indicates an attempt to free the thread shadow stack twice.
         * Warn about it.
         */
        if (WARN_ON(!shstk->size))
                return;

        unmap_shadow_stack(shstk->base, shstk->size);

        shstk->size = 0;
}

/*
 * Enable or disable WRSS for the current task, keeping the
 * ARCH_SHSTK_WRSS feature bit and CET_WRSS_EN in MSR_IA32_U_CET in sync.
 *
 * Returns 0 on success or when already in the requested state,
 * -EOPNOTSUPP when the CPU feature is off, or -EPERM when shadow stack is
 * not enabled.
 */
static int wrss_control(bool enable)
{
        u64 msrval;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /*
         * Only enable WRSS if shadow stack is enabled. If shadow stack is not
         * enabled, WRSS will already be disabled, so don't bother clearing it
         * when disabling.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -EPERM;

        /* Already enabled/disabled? */
        if (features_enabled(ARCH_SHSTK_WRSS) == enable)
                return 0;

        fpregs_lock_and_load();
        rdmsrq(MSR_IA32_U_CET, msrval);

        if (enable) {
                features_set(ARCH_SHSTK_WRSS);
                wrmsrq(MSR_IA32_U_CET, msrval | CET_WRSS_EN);
        } else {
                features_clr(ARCH_SHSTK_WRSS);
                /* Skip the MSR write when the bit is already clear */
                if (msrval & CET_WRSS_EN)
                        wrmsrq(MSR_IA32_U_CET, msrval & ~CET_WRSS_EN);
        }

        fpregs_unlock();

        return 0;
}

/*
 * Disable shadow stack (and with it WRSS) for the current task: zero the
 * CET and SSP MSRs, free the tracked shadow stack and clear both feature
 * bits.
 *
 * Returns 0 on success or when already disabled, -EOPNOTSUPP when the
 * CPU feature is off.
 */
static int shstk_disable(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /* Already disabled? */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /* Disable WRSS too when disabling shadow stack */
        fpregs_lock_and_load();
        wrmsrq(MSR_IA32_U_CET, 0);
        wrmsrq(MSR_IA32_PL3_SSP, 0);
        fpregs_unlock();

        /*
         * Free before clearing the feature bits: shstk_free() bails out
         * when ARCH_SHSTK_SHSTK is not set.
         */
        shstk_free(current);
        features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);

        return 0;
}

/*
 * map_shadow_stack() - map a new shadow stack, optionally with a restore
 * token.
 *
 * @addr:  requested address (must be 0 or at least 4G)
 * @size:  requested size; the mapping is page-aligned, and with
 *         SHADOW_STACK_SET_TOKEN @size is also passed to alloc_shstk() as
 *         the token offset from the start of the mapping
 * @flags: 0 or SHADOW_STACK_SET_TOKEN
 *
 * Returns the result of alloc_shstk(), or -EOPNOTSUPP (CPU feature off),
 * -EINVAL (unknown flags), -ENOSPC (no room for a token), -ERANGE (@addr
 * below 4G), -EOVERFLOW (@size overflows when page-aligned).
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
        unsigned long map_size;
        bool want_token;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        if (flags & ~SHADOW_STACK_SET_TOKEN)
                return -EINVAL;

        want_token = flags & SHADOW_STACK_SET_TOKEN;

        /* If there isn't space for a token */
        if (want_token && size < 8)
                return -ENOSPC;

        if (addr && addr < SZ_4G)
                return -ERANGE;

        /*
         * An overflow would result in attempting to write the restore token
         * to the wrong location. Not catastrophic, but just return the right
         * error code and block it.
         */
        map_size = PAGE_ALIGN(size);
        if (map_size < size)
                return -EOVERFLOW;

        return alloc_shstk(addr, map_size, size, want_token);
}

/*
 * Handle the ARCH_SHSTK_* prctl operations.
 *
 * @task:   target task; enable/disable are only accepted for current
 * @option: ARCH_SHSTK_* operation
 * @arg2:   feature mask, or for ARCH_SHSTK_STATUS a user pointer that
 *          receives the task's feature mask
 *
 * Returns 0 or the result of the selected operation; -EINVAL for an
 * unsupported request, more than one feature bit, or no known feature
 * bit; -EPERM when a requested feature is locked.
 */
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
        unsigned long features = arg2;

        if (option == ARCH_SHSTK_STATUS)
                return put_user(task->thread.features, (unsigned long __user *)arg2);

        if (option == ARCH_SHSTK_LOCK) {
                task->thread.features_locked |= features;
                return 0;
        }

        /* Only allow via ptrace */
        if (task != current) {
                if (option != ARCH_SHSTK_UNLOCK || !IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))
                        return -EINVAL;

                task->thread.features_locked &= ~features;
                return 0;
        }

        /* Do not allow to change locked features */
        if (features & task->thread.features_locked)
                return -EPERM;

        /* Only support enabling/disabling one feature at a time. */
        if (hweight_long(features) > 1)
                return -EINVAL;

        if (option == ARCH_SHSTK_DISABLE) {
                if (features & ARCH_SHSTK_WRSS)
                        return wrss_control(false);
                if (features & ARCH_SHSTK_SHSTK)
                        return shstk_disable();
                return -EINVAL;
        }

        /*
         * Handle ARCH_SHSTK_ENABLE.  Every option that gets this far is
         * treated as an enable request; presumably the caller only passes
         * the ARCH_SHSTK_* options handled here - verify.
         */
        if (features & ARCH_SHSTK_SHSTK)
                return shstk_setup();
        if (features & ARCH_SHSTK_WRSS)
                return wrss_control(true);

        return -EINVAL;
}

int shstk_update_last_frame(unsigned long val)
{
        unsigned long ssp;

        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        ssp = get_user_shstk_addr();
        return write_user_shstk_64((u64 __user *)ssp, (u64)val);
}

/* Report whether shadow stack is enabled for the current task. */
bool shstk_is_enabled(void)
{
        bool enabled = features_enabled(ARCH_SHSTK_SHSTK);

        return enabled;
}