root/include/linux/entry-common.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>
#include <asm/syscall.h>

#ifndef _TIF_UPROBE
# define _TIF_UPROBE                    (0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER        (0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT         (0)
#endif

#define SYSCALL_WORK_ENTER      (SYSCALL_WORK_SECCOMP |                 \
                                 SYSCALL_WORK_SYSCALL_TRACEPOINT |      \
                                 SYSCALL_WORK_SYSCALL_TRACE |           \
                                 SYSCALL_WORK_SYSCALL_EMU |             \
                                 SYSCALL_WORK_SYSCALL_AUDIT |           \
                                 SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
                                 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |      \
                                 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT       (SYSCALL_WORK_SYSCALL_TRACEPOINT |      \
                                 SYSCALL_WORK_SYSCALL_TRACE |           \
                                 SYSCALL_WORK_SYSCALL_AUDIT |           \
                                 SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
                                 SYSCALL_WORK_SYSCALL_EXIT_TRAP |       \
                                 ARCH_SYSCALL_WORK_EXIT)

/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back to
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
        return ptrace_report_syscall_entry(regs);
}
#endif

bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
        if (unlikely(audit_context())) {
                unsigned long args[6];

                syscall_get_arguments(current, regs, args);
                audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
        }
}

static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
        long syscall, ret = 0;

        /*
         * Handle Syscall User Dispatch.  This must comes first, since
         * the ABI here can be something that doesn't make sense for
         * other syscall_work features.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (syscall_user_dispatch(regs))
                        return -1L;
        }

        /*
         * User space got a time slice extension granted and relinquishes
         * the CPU. The work stops the slice timer to avoid an extra round
         * through hrtimer_interrupt().
         */
        if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
                rseq_syscall_enter_work(syscall_get_nr(current, regs));

        /* Handle ptrace */
        if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
                ret = arch_ptrace_report_syscall_entry(regs);
                if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
                        return -1L;
        }

        /* Do seccomp after ptrace, to catch any tracer changes. */
        if (work & SYSCALL_WORK_SECCOMP) {
                ret = __secure_computing();
                if (ret == -1L)
                        return ret;
        }

        /* Either of the above might have changed the syscall number */
        syscall = syscall_get_nr(current, regs);

        if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
                syscall = trace_syscall_enter(regs, syscall);

        syscall_enter_audit(regs, syscall);

        return ret ? : syscall;
}

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *                                     a syscall
 * @regs:       Pointer to currents pt_regs
 * @syscall:    The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking enter_from_user_mode(), enabling interrupts and
 * extra architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first.  If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(regs, work);

        return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *                                before invoking a syscall
 * @regs:       Pointer to currents pt_regs
 * @syscall:    The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
        long ret;

        enter_from_user_mode(regs);

        instrumentation_begin();
        local_irq_enable();
        ret = syscall_enter_from_user_mode_work(regs, syscall);
        instrumentation_end();

        return ret;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
 * instruction has been already reported in syscall_enter_from_user_mode().
 */
static __always_inline bool report_single_step(unsigned long work)
{
        if (work & SYSCALL_WORK_SYSCALL_EMU)
                return false;

        return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back to
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
                                                            int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
                                                            int step)
{
        ptrace_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:       Pointer to current pt_regs
 * @work:       Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
        bool step;

        /*
         * If the syscall was rolled back due to syscall user dispatching,
         * then the tracers below are not invoked for the same reason as
         * the entry side was not invoked in syscall_trace_enter(): The ABI
         * of these syscalls is unknown.
         */
        if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
                if (unlikely(current->syscall_dispatch.on_dispatch)) {
                        current->syscall_dispatch.on_dispatch = false;
                        return;
                }
        }

        audit_syscall_exit(regs);

        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                trace_syscall_exit(regs, syscall_get_return_value(current, regs));

        step = report_single_step(work);
        if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
                arch_ptrace_report_syscall_exit(regs, step);
}

/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:       Pointer to currents pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
        unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
        unsigned long nr = syscall_get_nr(current, regs);

        CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

        if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
                if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
                        local_irq_enable();
        }

        rseq_debug_syscall_return(regs);

        /*
         * Do one-time syscall specific work. If these work items are
         * enabled, we want to run them exactly once per syscall exit with
         * interrupts enabled.
         */
        if (unlikely(work & SYSCALL_WORK_EXIT))
                syscall_exit_work(regs, work);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:       Pointer to currents pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *      - rseq syscall exit
 *      - audit
 *      - syscall tracing
 *      - ptrace (single stepping)
 *
 *  2) Preparatory work
 *      - Disable interrupts
 *      - Exit to user mode loop (common TIF handling). Invokes
 *        arch_exit_to_user_mode_work() for architecture specific TIF work
 *      - Architecture specific one time work arch_exit_to_user_mode_prepare()
 *      - Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
        instrumentation_begin();
        syscall_exit_to_user_mode_work(regs);
        local_irq_disable_exit_to_user();
        syscall_exit_to_user_mode_prepare(regs);
        instrumentation_end();
        exit_to_user_mode();
}

#endif