root/kernel/trace/trace_syscalls.c
// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>       /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <linux/xarray.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

/*
 * Taken by the syscall enter event register/unregister paths in this file;
 * the user fault buffer enable/disable helpers assert that it is held.
 */
static DEFINE_MUTEX(syscall_trace_lock);

/* Forward declarations; both are static, so they are defined in this file */
static int syscall_enter_register(struct trace_event_call *event,
                                 enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
                                 enum trace_reg type, void *data);

/*
 * Return the enter_fields list head embedded in the syscall metadata that
 * @call->data points to.
 */
static struct list_head *
syscall_get_enter_fields(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        struct list_head *fields = &meta->enter_fields;

        return fields;
}

/*
 * Start and end of the array of syscall_metadata pointers that
 * find_syscall_meta() scans.
 */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/*
 * Syscall number to metadata lookup used by syscall_nr_to_meta(): the xarray
 * is consulted when CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled, otherwise the
 * flat array is indexed directly by syscall number.
 */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
        /*
         * Both names are compared starting at their fourth character, so
         * the leading "sys" is never looked at. Archs that use syscall
         * wrappers may have symbol aliases prefixed with ".SyS" or ".sys"
         * instead of "sys", and comparing the prefix would cause an
         * unwanted mismatch.
         */
        const char *sym_tail = sym + 3;
        const char *name_tail = name + 3;

        return strcmp(sym_tail, name_tail) == 0;
}
#endif

#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 *     *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
        /* Anything that is not a compat syscall is reported as is */
        if (likely(!arch_trace_is_compat_syscall(regs)))
                return syscall_get_nr(task, regs);

        /* Compat syscalls get a negative number, which the probes reject */
        return -1;
}
#else
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
        /* No compat filtering in this configuration */
        int nr = syscall_get_nr(task, regs);

        return nr;
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */

/*
 * find_syscall_meta - look up the metadata describing a syscall handler
 * @syscall: address of the syscall handler function
 *
 * Resolves @syscall to a symbol name and scans the metadata pointers between
 * __start_syscalls_metadata and __stop_syscalls_metadata for an entry whose
 * name matches according to arch_syscall_match_sym_name().
 *
 * Returns the matching metadata, or NULL when the address cannot be resolved
 * to a symbol, resolves to sys_ni_syscall, or has no matching metadata.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
        struct syscall_metadata **start = __start_syscalls_metadata;
        struct syscall_metadata **stop = __stop_syscalls_metadata;
        char str[KSYM_SYMBOL_LEN];

        /*
         * If the lookup fails, str does not hold a usable symbol name.
         * The generic arch_syscall_match_sym_name() compares from str + 3,
         * which would then read bytes that were never written, so treat a
         * failed lookup as "no metadata".
         */
        if (!kallsyms_lookup(syscall, NULL, NULL, NULL, str))
                return NULL;

        /* Unimplemented syscalls have no metadata */
        if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
                return NULL;

        for ( ; start < stop; start++) {
                if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
                        return *start;
        }
        return NULL;
}

/*
 * Map a syscall number to its metadata, or NULL if there is none.
 *
 * With CONFIG_HAVE_SPARSE_SYSCALL_NR the xarray is consulted; otherwise the
 * flat table is indexed after checking that it exists and @nr is in range.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
        if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
                return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);

        if (nr < 0 || nr >= NR_syscalls)
                return NULL;

        return syscalls_metadata ? syscalls_metadata[nr] : NULL;
}

/* Return the name recorded in the metadata of @syscall, or NULL if none */
const char *get_syscall_name(int syscall)
{
        struct syscall_metadata *meta = syscall_nr_to_meta(syscall);

        return meta ? meta->name : NULL;
}

/* Added to user strings or arrays when max limit is reached */
#define EXTRA "..."

/*
 * Decode one dynamic data descriptor of a syscall enter event.
 *
 * The descriptors are 4 byte words stored right after the entry->nb_args
 * argument slots; *@offset_p selects which one to read and is advanced by 4
 * on return. Each word holds (len << 16 | offset), where the offset is
 * relative to the start of @trace. The decoded length is stored in *@len_p
 * and the address of the data in *@ptr_p.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
                                struct syscall_metadata *entry,
                                int *offset_p, int *len_p, unsigned char **ptr_p)
{
        int meta_off = *offset_p;
        unsigned char *meta;
        int packed;

        /* Locate the descriptor word behind the static arguments */
        meta = (void *)trace->args + sizeof(long) * entry->nb_args + meta_off;
        packed = *(int *)meta;

        /* Upper half is the length, lower 16 bits the offset into the event */
        *len_p = packed >> 16;
        *ptr_p = (void *)trace + (packed & 0xffff);

        /* Step to the next descriptor for the following call */
        *offset_p = meta_off + 4;
}

/*
 * Print a syscall enter event using the openat specific layout.
 *
 * Argument index 2 is decoded into O_* flag names and argument index 3 is
 * printed in octal. When none of the O_CREAT or O_TMPFILE bits are set in
 * the flags, printing stops after the flags. Any other argument is printed
 * as an unsigned decimal and, if its bit is set in entry->user_mask, is
 * followed by the recorded user space data as a quoted string.
 *
 * Returns the result of trace_handle_return() on @s.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
                       struct trace_seq *s, struct trace_event *event)
{
        unsigned char *ptr;
        int offset = 0;
        int bits, len;
        bool done = false;
        /* Flag value to name table handed to trace_print_flags_seq() */
        static const struct trace_print_flags __flags[] =
                {
                        { O_TMPFILE, "O_TMPFILE" },
                        { O_WRONLY, "O_WRONLY" },
                        { O_RDWR, "O_RDWR" },
                        { O_CREAT, "O_CREAT" },
                        { O_EXCL, "O_EXCL" },
                        { O_NOCTTY, "O_NOCTTY" },
                        { O_TRUNC, "O_TRUNC" },
                        { O_APPEND, "O_APPEND" },
                        { O_NONBLOCK, "O_NONBLOCK" },
                        { O_DSYNC, "O_DSYNC" },
                        { O_DIRECT, "O_DIRECT" },
                        { O_LARGEFILE, "O_LARGEFILE" },
                        { O_DIRECTORY, "O_DIRECTORY" },
                        { O_NOFOLLOW, "O_NOFOLLOW" },
                        { O_NOATIME, "O_NOATIME" },
                        { O_CLOEXEC, "O_CLOEXEC" },
                        { -1, NULL }
                };

        trace_seq_printf(s, "%s(", entry->name);

        /* "done" is set while handling the flags to skip the remaining args */
        for (int i = 0; !done && i < entry->nb_args; i++) {

                /* On overflow, skip the closing ')' and only end the line */
                if (trace_seq_has_overflowed(s))
                        goto end;

                if (i)
                        trace_seq_puts(s, ", ");

                switch (i) {
                case 2:
                        bits = trace->args[2];

                        trace_seq_puts(s, "flags: ");

                        /* No need to show mode when not creating the file */
                        if (!(bits & (O_CREAT|O_TMPFILE)))
                                done = true;

                        /* No access mode bits set means read only */
                        if (!(bits & O_ACCMODE)) {
                                if (!bits) {
                                        trace_seq_puts(s, "O_RDONLY");
                                        continue;
                                }
                                trace_seq_puts(s, "O_RDONLY|");
                        }

                        trace_print_flags_seq(s, "|", bits, __flags);
                        /*
                         * trace_print_flags_seq() adds a '\0' to the
                         * buffer, but this needs to append more to the seq.
                         */
                        if (!trace_seq_has_overflowed(s))
                                trace_seq_pop(s);

                        continue;
                case 3:
                        /* Printed in octal with a leading '0' */
                        trace_seq_printf(s, "%s: 0%03o", entry->args[i],
                                         (unsigned int)trace->args[i]);
                        continue;
                }

                trace_seq_printf(s, "%s: %lu", entry->args[i],
                                 trace->args[i]);

                /* Only args flagged in user_mask have user data recorded */
                if (!(BIT(i) & entry->user_mask))
                        continue;

                get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
                trace_seq_printf(s, " \"%.*s\"", len, ptr);
        }

        trace_seq_putc(s, ')');
end:
        trace_seq_putc(s, '\n');

        return trace_handle_return(s);
}

/*
 * Print a syscall enter event as "name(arg: value, ...)".
 *
 * openat events are handed to sys_enter_openat_print() unless the VERBOSE
 * trace flag is set. For all other events each argument is printed as a
 * decimal (values below 10) or hexadecimal number, prefixed by its type when
 * VERBOSE is set. Arguments flagged in entry->user_mask are followed by the
 * recorded user space data, either as a quoted string or as a hex dump with
 * an optional printable rendering.
 *
 * If no metadata is found for the syscall number, or the event type does not
 * match, only a newline is emitted.
 *
 * Returns the result of trace_handle_return() on the iterator's seq.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
                    struct trace_event *event)
{
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        struct trace_entry *ent = iter->ent;
        struct syscall_trace_enter *trace;
        struct syscall_metadata *entry;
        int i, syscall, val, len;
        unsigned char *ptr;
        int offset = 0;

        trace = (typeof(trace))ent;
        syscall = trace->nr;
        entry = syscall_nr_to_meta(syscall);

        if (!entry)
                goto end;

        /* The record must belong to this syscall's enter event */
        if (entry->enter_event->event.type != ent->type) {
                WARN_ON_ONCE(1);
                goto end;
        }

        /* Syscalls with a dedicated, more readable output format */
        switch (entry->syscall_nr) {
        case __NR_openat:
                if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
                        return sys_enter_openat_print(trace, entry, s, event);
                break;
        default:
                break;
        }

        trace_seq_printf(s, "%s(", entry->name);

        for (i = 0; i < entry->nb_args; i++) {
                bool printable = false;
                char *str;

                /* On overflow, skip the closing ')' and only end the line */
                if (trace_seq_has_overflowed(s))
                        goto end;

                if (i)
                        trace_seq_puts(s, ", ");

                /* parameter types */
                if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
                        trace_seq_printf(s, "%s ", entry->types[i]);

                /* parameter values */
                if (trace->args[i] < 10)
                        trace_seq_printf(s, "%s: %lu", entry->args[i],
                                         trace->args[i]);
                else
                        trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
                                         trace->args[i]);

                /* Only args flagged in user_mask have user data recorded */
                if (!(BIT(i) & entry->user_mask))
                        continue;

                get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

                /* No size argument, or marked as a string: print as text */
                if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
                        trace_seq_printf(s, " \"%.*s\"", len, ptr);
                        continue;
                }

                /* The size the caller passed, used to detect truncation */
                val = trace->args[entry->user_arg_size];

                /* Hex dump of the recorded bytes, separated by ':' */
                str = ptr;
                trace_seq_puts(s, " (");
                for (int x = 0; x < len; x++, ptr++) {
                        if (isascii(*ptr) && isprint(*ptr))
                                printable = true;
                        if (x)
                                trace_seq_putc(s, ':');
                        trace_seq_printf(s, "%02x", *ptr);
                }
                /* Fewer bytes recorded than the size argument says */
                if (len < val)
                        trace_seq_printf(s, ", %s", EXTRA);

                trace_seq_putc(s, ')');

                /* If nothing is printable, don't bother printing anything */
                if (!printable)
                        continue;

                /* Text rendering with '.' standing in for other bytes */
                trace_seq_puts(s, " \"");
                for (int x = 0; x < len; x++) {
                        if (isascii(str[x]) && isprint(str[x]))
                                trace_seq_putc(s, str[x]);
                        else
                                trace_seq_putc(s, '.');
                }
                if (len < val)
                        trace_seq_printf(s, "\"%s", EXTRA);
                else
                        trace_seq_putc(s, '"');
        }

        trace_seq_putc(s, ')');
end:
        trace_seq_putc(s, '\n');

        return trace_handle_return(s);
}

/*
 * Print a syscall exit event as "name -> 0x<ret>".
 *
 * If no metadata exists for the syscall number only a newline is written.
 * A record whose type does not match the syscall's exit event triggers a
 * one-time warning and is reported as TRACE_TYPE_UNHANDLED.
 */
static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
                   struct trace_event *event)
{
        struct trace_entry *ent = iter->ent;
        struct syscall_trace_exit *rec = (struct syscall_trace_exit *)ent;
        struct trace_seq *seq = &iter->seq;
        struct syscall_metadata *meta = syscall_nr_to_meta(rec->nr);

        if (meta) {
                if (meta->exit_event->event.type != ent->type) {
                        WARN_ON_ONCE(1);
                        return TRACE_TYPE_UNHANDLED;
                }

                trace_seq_printf(seq, "%s -> 0x%lx\n", meta->name,
                                 rec->ret);
        } else {
                trace_seq_putc(seq, '\n');
        }

        return trace_handle_return(seq);
}

/*
 * Expand to a designated initializer describing a field named @_name of type
 * @_type: type and name are stringified, size, alignment and signedness are
 * derived from @_type, and the filter type is FILTER_OTHER.
 */
#define SYSCALL_FIELD(_type, _name) {                                   \
        .type = #_type, .name = #_name,                                 \
        .size = sizeof(_type), .align = __alignof__(_type),             \
        .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

/*
 * Write the print_fmt string for the openat enter event into @buf.
 *
 * The format prints dfd and filename as hex values, the filename user string,
 * the flags as O_* names via __print_flags() (with "O_RDONLY" handled
 * separately since it has no bits set), and the mode in octal.
 *
 * When @len is 0 nothing is written and only the needed length is computed
 * (see LEN_OR_ZERO). @entry is not referenced here.
 *
 * Returns the accumulated snprintf() length of the format.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
        int pos = 0;

        /* The format string itself, followed by its arguments */
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " ((unsigned long)(REC->dfd)),");
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " ((unsigned long)(REC->filename)),");
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " __get_str(__filename_val),");
        /* Prefix "O_RDONLY|" when other flags are set but no access mode is */
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
        /* A flags value of zero prints as plain "O_RDONLY" */
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " REC->flags ? __print_flags(REC->flags, \"|\", ");
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        "{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

        pos += snprintf(buf + pos, LEN_OR_ZERO,
                        " ((unsigned long)(REC->mode))");
        return pos;
}

/*
 * Write the print_fmt string of a syscall enter event into @buf.
 *
 * openat is delegated to sys_enter_openat_print_fmt(). For every other
 * syscall the format lists each argument as "<name>: 0x<hex>", and for
 * arguments flagged in entry->user_mask appends either a quoted string or a
 * parenthesized dynamic array, depending on whether the syscall has a size
 * argument and is marked as a string.
 *
 * When @len is 0 nothing is written and only the needed length is computed
 * (see LEN_OR_ZERO).
 *
 * Returns the accumulated snprintf() length of the format.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
        bool is_string = entry->user_arg_is_str;
        int i;
        int pos = 0;

        /* Syscalls with a dedicated format */
        switch (entry->syscall_nr) {
        case __NR_openat:
                return sys_enter_openat_print_fmt(entry, buf, len);
        default:
                break;
        }

        /* First pass: the quoted format string */
        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
        for (i = 0; i < entry->nb_args; i++) {
                if (i)
                        pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
                pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
                                entry->args[i], sizeof(unsigned long));

                if (!(BIT(i) & entry->user_mask))
                        continue;

                /* Add the format for the user space string or array */
                if (entry->user_arg_size < 0 || is_string)
                        pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
                else
                        pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
        }
        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

        /* Second pass: the arguments consumed by the format string */
        for (i = 0; i < entry->nb_args; i++) {
                pos += snprintf(buf + pos, LEN_OR_ZERO,
                                ", ((unsigned long)(REC->%s))", entry->args[i]);
                if (!(BIT(i) & entry->user_mask))
                        continue;
                /* The user space data for arg has name __<arg>_val */
                if (entry->user_arg_size < 0 || is_string) {
                        pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
                                        entry->args[i]);
                } else {
                        pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
                                        entry->args[i]);
                }
        }

#undef LEN_OR_ZERO

        /* return the length of print_fmt */
        return pos;
}

/*
 * Set call->print_fmt for a syscall event.
 *
 * An event that is not the syscall's enter event gets a constant format that
 * prints REC->ret. The enter event gets a format that is built in an
 * allocated buffer by __set_enter_print_fmt().
 *
 * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        char *fmt;
        int fmt_len;

        if (meta->enter_event != call) {
                call->print_fmt = "\"0x%lx\", REC->ret";
                return 0;
        }

        /* A zero length dry run reports how many bytes are needed */
        fmt_len = __set_enter_print_fmt(meta, NULL, 0);

        fmt = kmalloc(fmt_len + 1, GFP_KERNEL);
        if (!fmt)
                return -ENOMEM;

        /* Now generate the format for real, including the terminating nul */
        __set_enter_print_fmt(meta, fmt, fmt_len + 1);
        call->print_fmt = fmt;

        return 0;
}

/*
 * Free the print_fmt of @call if it is the syscall's enter event. Other
 * events use a string literal (see set_syscall_print_fmt()) and are left
 * alone.
 */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;

        if (meta->enter_event != call)
                return;

        kfree(call->print_fmt);
}

/*
 * Define the fields of a syscall enter event.
 *
 * Every syscall argument becomes an unsigned long sized field starting at
 * the offset of the args array. For each bit set in meta->user_mask an
 * additional 4 byte "__data_loc char[]" field named "__<arg>_val" is defined
 * after them.
 *
 * Returns 0 on success, the error from trace_define_field(), or -ENOMEM if a
 * field name cannot be allocated (in which case meta->user_mask is cleared).
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        int offset = offsetof(struct syscall_trace_enter, args);
        unsigned long pending;
        int ret;
        int i;

        for (i = 0; i < meta->nb_args; i++) {
                ret = trace_define_field(call, meta->types[i],
                                         meta->args[i], offset,
                                         sizeof(unsigned long), 0,
                                         FILTER_OTHER);
                if (ret)
                        return ret;
                offset += sizeof(unsigned long);
        }

        /* Walk the set bits of the user mask from lowest to highest */
        pending = meta->user_mask;
        while (pending) {
                int idx = ffs(pending) - 1;
                char *field_name;
                int name_len;

                pending &= ~BIT(idx);

                /*
                 * User space data is faulted into a temporary buffer and
                 * then appended to the event as a dynamic string or array.
                 * Its field is called "__<arg>_val"; the sizeof() accounts
                 * for the added characters and the terminating nul.
                 */
                name_len = strlen(meta->args[idx]) + sizeof("___val");
                field_name = kmalloc(name_len, GFP_KERNEL);
                if (WARN_ON_ONCE(!field_name)) {
                        meta->user_mask = 0;
                        return -ENOMEM;
                }

                snprintf(field_name, name_len, "__%s_val", meta->args[idx]);

                ret = trace_define_field(call, "__data_loc char[]",
                                         field_name, offset, sizeof(int), 0,
                                         FILTER_OTHER);
                if (ret) {
                        kfree(field_name);
                        return ret;
                }
                offset += 4;
        }

        return 0;
}

/*
 * Create a per CPU temporary buffer to copy user space pointers into.
 *
 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
 *  (defined in kernel/trace/trace.h)
 *
 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
 *   nul terminating byte and possibly appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
 * to copy memory from user space addresses into that will hold
 * 3 args as only 3 args are allowed to be copied from system calls.
 */
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT 3
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)

/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
        /* Passed to the trace_user_fault_*() helpers */
        struct trace_user_buf_info      buf;
        /* Used to defer freeing via call_rcu_tasks_trace() */
        struct rcu_head                 rcu;
};

/*
 * The buffer currently in use, or NULL. Set and cleared with WRITE_ONCE() by
 * the enable/disable helpers, which assert that syscall_trace_lock is held,
 * and read with READ_ONCE() by syscall_get_data().
 */
static struct syscall_user_buffer *syscall_buffer;

/*
 * Make the user space fault buffer available for one more user.
 *
 * If a buffer already exists, trace_user_fault_get() is called on it.
 * Otherwise a new one is allocated, initialized to SYSCALL_FAULT_BUF_SZ and
 * published in syscall_buffer. Must be called with syscall_trace_lock held.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negative
 * value returned by trace_user_fault_init().
 */
static int syscall_fault_buffer_enable(void)
{
        struct syscall_user_buffer *fresh;
        int err;

        lockdep_assert_held(&syscall_trace_lock);

        if (syscall_buffer) {
                trace_user_fault_get(&syscall_buffer->buf);
                return 0;
        }

        fresh = kmalloc_obj(*fresh);
        if (!fresh)
                return -ENOMEM;

        err = trace_user_fault_init(&fresh->buf, SYSCALL_FAULT_BUF_SZ);
        if (err >= 0) {
                WRITE_ONCE(syscall_buffer, fresh);
                return 0;
        }

        kfree(fresh);
        return err;
}

/* RCU callback: destroy and free the syscall_user_buffer that embeds @rcu */
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
{
        struct syscall_user_buffer *stale;

        stale = container_of(rcu, struct syscall_user_buffer, rcu);

        trace_user_fault_destroy(&stale->buf);
        kfree(stale);
}


/*
 * Drop one user of the user space fault buffer.
 *
 * When trace_user_fault_put() returns zero, syscall_buffer is cleared and
 * the buffer is freed through call_rcu_tasks_trace(). A non-zero return
 * leaves the buffer in place (presumably other users remain).
 * Must be called with syscall_trace_lock held.
 */
static void syscall_fault_buffer_disable(void)
{
        struct syscall_user_buffer *old = syscall_buffer;

        lockdep_assert_held(&syscall_trace_lock);

        if (!trace_user_fault_put(&old->buf)) {
                WRITE_ONCE(syscall_buffer, NULL);
                call_rcu_tasks_trace(&old->rcu, rcu_free_syscall_buffer);
        }
}

/* Passed as the data argument to the syscall_copy_user*() callbacks */
struct syscall_args {
        /* User space addresses to read, one per user argument */
        char            *ptr_array[SYSCALL_FAULT_MAX_CNT];
        /* Per argument result: number of bytes read, or negative on failure */
        int             read[SYSCALL_FAULT_MAX_CNT];
        /* Number of valid entries in ptr_array[] and read[] */
        int             uargs;
};

/*
 * Copy callback for string arguments.
 *
 * For each of the args->uargs user pointers, up to @size bytes are copied
 * with strncpy_from_user() into consecutive SYSCALL_FAULT_ARG_SZ sized slots
 * of @buf, and the result of each copy is stored in args->read[]. The
 * incoming @ptr value is not used. Always returns 0.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
                             size_t size, void *data)
{
        struct syscall_args *args = data;
        char *slot = buf;
        int i = 0;

        while (i < args->uargs) {
                const char __user *src = (char __user *)args->ptr_array[i];

                args->read[i] = strncpy_from_user(slot, src, size);

                slot += SYSCALL_FAULT_ARG_SZ;
                i++;
        }

        return 0;
}

/*
 * Copy callback for array arguments.
 *
 * For each of the args->uargs user pointers, @size bytes are copied with
 * __copy_from_user() into consecutive SYSCALL_FAULT_ARG_SZ sized slots of
 * @buf. args->read[] is set to @size when the copy returned 0 and to -1
 * otherwise. The incoming @ptr value is not used. Always returns 0.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
                                   size_t size, void *data)
{
        struct syscall_args *args = data;
        char *slot = buf;
        int i = 0;

        while (i < args->uargs) {
                const char __user *src = (char __user *)args->ptr_array[i];
                int failed = __copy_from_user(slot, src, size);

                args->read[i] = failed ? -1 : size;

                slot += SYSCALL_FAULT_ARG_SZ;
                i++;
        }

        return 0;
}

/*
 * Read the user space data of a syscall's flagged arguments.
 *
 * @buf_size:  limit on how much of each argument is recorded; zero means
 *             nothing is read at all
 * @sys_data:  metadata of the syscall (user_mask selects the arguments,
 *             user_arg_size >= 0 names the argument holding the array size)
 * @sbuf:      buffer handed to trace_user_fault_read()
 * @args:      the syscall argument values
 * @data_size: output; entries past the number of user arguments are set to
 *             -1, entries for arguments that were read are set to the number
 *             of bytes to record. Entries for arguments whose read failed
 *             are left untouched.
 *
 * Each argument's data sits in its own SYSCALL_FAULT_ARG_SZ sized slot of
 * the returned buffer. Strings have their non-printable characters replaced
 * by '.' and get EXTRA appended when truncated.
 *
 * Returns the buffer from trace_user_fault_read(), or NULL if @buf_size is
 * zero or the read returned NULL.
 */
static char *sys_fault_user(unsigned int buf_size,
                            struct syscall_metadata *sys_data,
                            struct syscall_user_buffer *sbuf,
                            unsigned long *args,
                            unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
        trace_user_buf_copy syscall_copy = syscall_copy_user;
        unsigned long mask = sys_data->user_mask;
        unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
        struct syscall_args sargs;
        bool array = false;
        char *buffer;
        char *buf;
        int ret;
        int i = 0;

        /* The extra is appended to the user data in the buffer */
        BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
                     SYSCALL_FAULT_ARG_SZ);

        /*
         * If this system call event has a size argument, use
         * it to define how much of user space memory to read,
         * and read it as an array and not a string.
         */
        if (sys_data->user_arg_size >= 0) {
                array = true;
                size = args[sys_data->user_arg_size];
                /* Never read more than fits in one slot */
                if (size > SYSCALL_FAULT_ARG_SZ - 1)
                        size = SYSCALL_FAULT_ARG_SZ - 1;
                syscall_copy = syscall_copy_user_array;
        }

        /* Collect the user pointers, lowest set bit of the mask first */
        while (mask) {
                int idx = ffs(mask) - 1;
                mask &= ~BIT(idx);

                if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
                        break;

                /* Get the pointer to user space memory to read */
                sargs.ptr_array[i++] = (char *)args[idx];
        }

        sargs.uargs = i;

        /* Clear the values that are not used */
        for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
                data_size[i] = -1; /* Denotes no pointer */
        }

        /* A zero size means do not even try */
        if (!buf_size)
                return NULL;

        buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
                                       syscall_copy, &sargs);
        if (!buffer)
                return NULL;

        /* Post-process each slot that was read */
        buf = buffer;
        for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

                ret = sargs.read[i];
                /* The read of this argument failed; leave data_size[i] as is */
                if (ret < 0)
                        continue;
                buf[ret] = '\0';

                /* For strings, replace any non-printable characters with '.' */
                if (!array) {
                        for (int x = 0; x < ret; x++) {
                                if (!isprint(buf[x]))
                                        buf[x] = '.';
                        }

                        size = min(buf_size, SYSCALL_FAULT_USER_MAX);

                        /*
                         * If the text was truncated due to our max limit,
                         * add "..." to the string.
                         */
                        if (ret > size) {
                                strscpy(buf + size, EXTRA, sizeof(EXTRA));
                                ret = size + sizeof(EXTRA);
                        } else {
                                /* Count the terminating nul in the size */
                                buf[ret++] = '\0';
                        }
                } else {
                        /* Arrays are simply capped at buf_size bytes */
                        ret = min((unsigned int)ret, buf_size);
                }
                data_size[i] = ret;
        }

        return buffer;
}

/*
 * Read the user space data for a syscall and account for its size.
 *
 * The buffer returned by sys_fault_user() is stored in *@buffer. For every
 * leading non-negative entry of @user_sizes, *@size grows by that size plus
 * 4 bytes for the meta field that records where the data is and how long it
 * is. The number of such entries is stored in *@uargs.
 *
 * Returns 0, or -1 if syscall_buffer is NULL.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
                 char **buffer, int *size, int *user_sizes, int *uargs,
                 int buf_size)
{
        struct syscall_user_buffer *sbuf = READ_ONCE(syscall_buffer);
        int cnt = 0;

        /* If the syscall_buffer is NULL, tracing is being shutdown */
        if (!sbuf)
                return -1;

        *buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);

        /* A negative size marks the end of the user arguments */
        while (cnt < SYSCALL_FAULT_MAX_CNT && user_sizes[cnt] >= 0) {
                *size += user_sizes[cnt] + 4;
                cnt++;
        }

        /* Save the number of user read arguments of this syscall */
        *uargs = cnt;
        return 0;
}

/*
 * Append the user space data to a syscall enter event.
 *
 * Behind the sys_data->nb_args static arguments the event holds @uargs meta
 * words of 4 bytes each, followed by the data itself. Every meta word stores
 * (size << 16 | offset), where the offset is that of the argument's data
 * relative to the start of @entry. The data of argument i is taken from the
 * i-th SYSCALL_FAULT_ARG_SZ sized slot of @buffer.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
                             struct syscall_trace_enter *entry,
                             char *buffer, int size, int *user_sizes, int uargs)
{
        /* The meta words start right after the static arguments ... */
        void *meta = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
        /* ... and the data starts right after the meta words */
        void *payload = meta + 4 * uargs;
        int data_off = payload - (void *)entry;
        char *src = buffer;
        int i;

        /* Fill in one meta word per argument */
        for (i = 0; i < uargs; i++) {
                *(int *)meta = data_off | (user_sizes[i] << 16);
                meta += 4;

                /* The next argument's data follows this one's */
                data_off += user_sizes[i];
        }

        /* Copy the data; empty or faulted arguments have a zero size */
        for (i = 0; i < uargs; i++) {
                int nbytes = user_sizes[i];

                if (nbytes) {
                        memcpy(payload, src, nbytes);
                        payload += nbytes;
                }
                src += SYSCALL_FAULT_ARG_SZ;
        }
}

/*
 * sys_enter probe: record a syscall enter event into the trace buffer.
 *
 * @data is the trace_array the probe was registered with, @regs the register
 * state of the syscall. @id is not used here.
 *
 * Nothing is recorded when the syscall number is out of range, no event file
 * is enabled for it, the event is soft disabled, no metadata exists, or the
 * user fault buffer is gone. Otherwise the event holds the syscall number,
 * its arguments and, for syscalls with a user_mask, the user space data the
 * flagged arguments point to.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
        struct trace_array *tr = data;
        struct trace_event_file *trace_file;
        struct syscall_trace_enter *entry;
        struct syscall_metadata *sys_data;
        struct trace_event_buffer fbuffer;
        unsigned long args[6];
        char *user_ptr;
        /* Zeroed so that arguments that fail to be read count as empty */
        int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
        int syscall_nr;
        int size = 0;
        int uargs = 0;
        bool mayfault;

        /*
         * Syscall probe called with preemption enabled, but the ring
         * buffer and per-cpu data require preemption to be disabled.
         */
        might_fault();

        syscall_nr = trace_get_syscall_nr(current, regs);
        if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                return;

        trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
        if (!trace_file)
                return;

        if (trace_trigger_soft_disabled(trace_file))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        /* Check if this syscall event faults in user space memory */
        mayfault = sys_data->user_mask != 0;

        /* Preemption stays disabled from here until the function returns */
        guard(preempt_notrace)();

        syscall_get_arguments(current, regs, args);

        /* Read the user data first, as it adds to the size of the event */
        if (mayfault) {
                if (syscall_get_data(sys_data, args, &user_ptr,
                                     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
                        return;
        }

        size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

        entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
        if (!entry)
                return;

        entry = ring_buffer_event_data(fbuffer.event);
        entry->nr = syscall_nr;

        memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

        /* Append the meta fields and the user data behind the arguments */
        if (mayfault)
                syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

        trace_event_buffer_commit(&fbuffer);
}

/*
 * sys_exit probe: record a syscall exit event into the trace buffer.
 *
 * @data is the trace_array the probe was registered with, @regs the register
 * state of the syscall. The recorded return value is taken from @regs via
 * syscall_get_return_value(); the @ret parameter is not used.
 *
 * Nothing is recorded when the syscall number is out of range, no event file
 * is enabled for it, the event is soft disabled, or no metadata exists.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
        struct trace_array *tr = data;
        struct trace_event_buffer fbuffer;
        struct trace_event_file *file;
        struct syscall_trace_exit *rec;
        int nr;

        /*
         * Syscall probe called with preemption enabled, but the ring
         * buffer and per-cpu data require preemption to be disabled.
         */
        might_fault();
        guard(preempt_notrace)();

        nr = trace_get_syscall_nr(current, regs);
        if (nr < 0 || nr >= NR_syscalls)
                return;

        file = READ_ONCE(tr->exit_syscall_files[nr]);
        if (!file || trace_trigger_soft_disabled(file))
                return;

        /* Only syscalls that have metadata are recorded */
        if (!syscall_nr_to_meta(nr))
                return;

        rec = trace_event_buffer_reserve(&fbuffer, file, sizeof(*rec));
        if (!rec)
                return;

        rec = ring_buffer_event_data(fbuffer.event);
        rec->nr = nr;
        rec->ret = syscall_get_return_value(current, regs);

        trace_event_buffer_commit(&fbuffer);
}

/*
 * Enable the ftrace syscall-enter event @call for the instance that owns
 * @file.
 *
 * Under syscall_trace_lock: enables the syscall fault buffer when the
 * syscall has a non-zero user_mask, attaches ftrace_syscall_enter() to the
 * sys_enter tracepoint when this is the instance's first enter event, then
 * publishes @file in tr->enter_syscall_files[] and bumps the refcount.
 *
 * Returns 0 on success, -ENOSYS for an out-of-range syscall number, or the
 * negative error from the buffer/tracepoint setup.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
                                   struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        struct trace_array *tr = file->tr;
        int nr = meta->syscall_nr;
        int err;

        if (WARN_ON_ONCE(nr < 0 || nr >= NR_syscalls))
                return -ENOSYS;

        guard(mutex)(&syscall_trace_lock);

        if (meta->user_mask) {
                err = syscall_fault_buffer_enable();
                if (err < 0)
                        return err;
        }

        if (!tr->sys_refcount_enter) {
                err = register_trace_sys_enter(ftrace_syscall_enter, tr);
                if (err < 0) {
                        /* Undo the fault buffer enable taken just above. */
                        if (meta->user_mask)
                                syscall_fault_buffer_disable();
                        return err;
                }
        }

        WRITE_ONCE(tr->enter_syscall_files[nr], file);
        tr->sys_refcount_enter++;

        return 0;
}

/*
 * Disable the ftrace syscall-enter event @call for the instance that owns
 * @file; mirror image of reg_event_syscall_enter().
 *
 * Under syscall_trace_lock: drops the refcount, clears the slot in
 * tr->enter_syscall_files[], detaches the probe when the last enter event
 * of the instance goes away, and disables the syscall fault buffer if this
 * syscall has a non-zero user_mask.
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
                                      struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        struct trace_array *tr = file->tr;
        int nr = meta->syscall_nr;

        if (WARN_ON_ONCE(nr < 0 || nr >= NR_syscalls))
                return;

        guard(mutex)(&syscall_trace_lock);

        tr->sys_refcount_enter--;
        WRITE_ONCE(tr->enter_syscall_files[nr], NULL);

        if (tr->sys_refcount_enter == 0)
                unregister_trace_sys_enter(ftrace_syscall_enter, tr);

        if (meta->user_mask)
                syscall_fault_buffer_disable();
}

static int reg_event_syscall_exit(struct trace_event_file *file,
                                  struct trace_event_call *call)
{
        struct trace_array *tr = file->tr;
        int ret = 0;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!tr->sys_refcount_exit)
                ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
        if (!ret) {
                WRITE_ONCE(tr->exit_syscall_files[num], file);
                tr->sys_refcount_exit++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
}

static void unreg_event_syscall_exit(struct trace_event_file *file,
                                     struct trace_event_call *call)
{
        struct trace_array *tr = file->tr;
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
                return;
        mutex_lock(&syscall_trace_lock);
        tr->sys_refcount_exit--;
        WRITE_ONCE(tr->exit_syscall_files[num], NULL);
        if (!tr->sys_refcount_exit)
                unregister_trace_sys_exit(ftrace_syscall_exit, tr);
        mutex_unlock(&syscall_trace_lock);
}

/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
 *
 * @call: the event being initialized; only the syscall's enter event is
 *        processed, anything else returns without touching the meta data.
 * @nr:   the system call number to classify.
 *
 * All argument positions below are zero based.  Besides user_mask this
 * also sets:
 *   - user_arg_size:   index of the argument holding the length of the
 *                      user buffer, or -1 when there is none;
 *   - user_arg_is_str: set to 1 for the length-bounded buffers that are
 *                      flagged as strings below.
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
        struct syscall_metadata *sys_data = call->data;
        unsigned long mask;

        /* Only work on entry */
        if (sys_data->enter_event != call)
                return;

        sys_data->user_arg_size = -1;

        switch (nr) {
        /* user arg 1 with size arg at 2 */
        case __NR_write:
#ifdef __NR_mq_timedsend
        case __NR_mq_timedsend:
#endif
        case __NR_pwrite64:
                sys_data->user_mask = BIT(1);
                sys_data->user_arg_size = 2;
                break;
        /* user arg 0 with size arg at 1 as string */
        case __NR_setdomainname:
        case __NR_sethostname:
                sys_data->user_mask = BIT(0);
                sys_data->user_arg_size = 1;
                sys_data->user_arg_is_str = 1;
                break;
#ifdef __NR_kexec_file_load
        /*
         * kexec_file_load(kernel_fd, initrd_fd, cmdline_len, cmdline_ptr,
         * flags): user arg 3 with size arg at 2 as string.  Arg 4 is the
         * flags word and must not be treated as a user pointer.
         */
        case __NR_kexec_file_load:
                sys_data->user_mask = BIT(3);
                sys_data->user_arg_size = 2;
                sys_data->user_arg_is_str = 1;
                break;
#endif
        /* user arg at position 0 */
#ifdef __NR_access
        case __NR_access:
#endif
        case __NR_acct:
        case __NR_chdir:
#ifdef __NR_chown
        case __NR_chown:
#endif
#ifdef __NR_chmod
        case __NR_chmod:
#endif
        case __NR_chroot:
#ifdef __NR_creat
        case __NR_creat:
#endif
        case __NR_delete_module:
        case __NR_execve:
        case __NR_fsopen:
#ifdef __NR_lchown
        case __NR_lchown:
#endif
#ifdef __NR_open
        case __NR_open:
#endif
        case __NR_memfd_create:
#ifdef __NR_mkdir
        case __NR_mkdir:
#endif
#ifdef __NR_mknod
        case __NR_mknod:
#endif
        case __NR_mq_open:
        case __NR_mq_unlink:
#ifdef __NR_readlink
        case __NR_readlink:
#endif
#ifdef __NR_rmdir
        case __NR_rmdir:
#endif
        case __NR_shmdt:
#ifdef __NR_statfs
        case __NR_statfs:
#endif
        case __NR_swapon:
        case __NR_swapoff:
#ifdef __NR_truncate
        case __NR_truncate:
#endif
#ifdef __NR_unlink
        case __NR_unlink:
#endif
        case __NR_umount2:
#ifdef __NR_utime
        case __NR_utime:
#endif
#ifdef __NR_utimes
        case __NR_utimes:
#endif
                sys_data->user_mask = BIT(0);
                break;
        /* user arg at position 1 */
        case __NR_execveat:
        case __NR_faccessat:
        case __NR_faccessat2:
        case __NR_finit_module:
        case __NR_fchmodat:
        case __NR_fchmodat2:
        case __NR_fchownat:
        case __NR_fgetxattr:
        case __NR_flistxattr:
        case __NR_fsetxattr:
        case __NR_fspick:
        case __NR_fremovexattr:
#ifdef __NR_futimesat
        case __NR_futimesat:
#endif
        case __NR_inotify_add_watch:
        case __NR_mkdirat:
        case __NR_mknodat:
        case __NR_mount_setattr:
        case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
        case __NR_newfstatat:
#endif
        case __NR_openat:
        case __NR_openat2:
        case __NR_open_tree:
        case __NR_open_tree_attr:
        case __NR_readlinkat:
        case __NR_quotactl:
        case __NR_syslog:
        case __NR_statx:
        case __NR_unlinkat:
#ifdef __NR_utimensat
        case __NR_utimensat:
#endif
                sys_data->user_mask = BIT(1);
                break;
        /* user arg at position 2 */
        case __NR_init_module:
        case __NR_fsconfig:
                sys_data->user_mask = BIT(2);
                break;
        /* user arg at position 4 */
        case __NR_fanotify_mark:
                sys_data->user_mask = BIT(4);
                break;
        /* 2 user args, 0 and 1 */
        case __NR_add_key:
        case __NR_getxattr:
        case __NR_lgetxattr:
        case __NR_lremovexattr:
#ifdef __NR_link
        case __NR_link:
#endif
        case __NR_listxattr:
        case __NR_llistxattr:
        case __NR_lsetxattr:
        case __NR_pivot_root:
        case __NR_removexattr:
#ifdef __NR_rename
        case __NR_rename:
#endif
        case __NR_request_key:
        case __NR_setxattr:
#ifdef __NR_symlink
        case __NR_symlink:
#endif
                sys_data->user_mask = BIT(0) | BIT(1);
                break;
        /* 2 user args, 0 and 2 */
        case __NR_symlinkat:
                sys_data->user_mask = BIT(0) | BIT(2);
                break;
        /* 2 user args, 1 and 3 */
        case __NR_getxattrat:
        case __NR_linkat:
        case __NR_listxattrat:
        case __NR_move_mount:
#ifdef __NR_renameat
        case __NR_renameat:
#endif
        case __NR_renameat2:
        case __NR_removexattrat:
        case __NR_setxattrat:
                sys_data->user_mask = BIT(1) | BIT(3);
                break;
        /* 3 user args, 0, 1 and 2: dev_name, dir_name and type */
        case __NR_mount:
                sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
                break;
        default:
                /* Nothing to read from user space for this syscall. */
                sys_data->user_mask = 0;
                return;
        }

        if (sys_data->user_arg_size < 0)
                return;

        /*
         * The user_arg_size can only be used when the system call
         * is reading only a single address from user space.
         * "mask & (mask - 1)" is non-zero when more than one bit is set.
         */
        mask = sys_data->user_mask;
        if (WARN_ON(mask & (mask - 1)))
                sys_data->user_arg_size = -1;
}

/*
 * raw_init callback shared by the syscall enter and exit event classes.
 *
 * Rejects events whose metadata carries an out-of-range syscall number,
 * classifies the syscall's user space arguments, builds the print format
 * and finally registers the event through trace_event_raw_init().
 *
 * Returns the value of trace_event_raw_init() (negative on failure, in
 * which case the print format is freed again), -ENOSYS for an unmapped
 * syscall, or -ENOMEM when the print format cannot be set up.
 */
static int __init init_syscall_trace(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        int nr = meta->syscall_nr;
        int id;

        if (nr < 0 || nr >= NR_syscalls) {
                pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
                         meta->name);
                return -ENOSYS;
        }

        check_faultable_syscall(call, nr);

        if (set_syscall_print_fmt(call) < 0)
                return -ENOMEM;

        id = trace_event_raw_init(call);
        if (id < 0)
                free_syscall_print_fmt(call);

        return id;
}

/*
 * Field description table used as .fields_array of
 * event_class_syscall_enter: the fixed __syscall_nr field, followed by an
 * entry of type TRACE_FUNCTION_TYPE whose define_fields callback is
 * syscall_enter_define_fields(), and an empty terminating entry.
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
        SYSCALL_FIELD(int, __syscall_nr),
        { .type = TRACE_FUNCTION_TYPE,
          .define_fields = syscall_enter_define_fields },
        {}
};

/*
 * Output callbacks for syscall enter events; only .trace is set, to
 * print_syscall_enter().  Not static, so presumably referenced from
 * outside this file -- confirm against the users of this symbol.
 */
struct trace_event_functions enter_syscall_print_funcs = {
        .trace          = print_syscall_enter,
};

/*
 * Output callbacks for syscall exit events; only .trace is set, to
 * print_syscall_exit().  Not static, so presumably referenced from
 * outside this file -- confirm against the users of this symbol.
 */
struct trace_event_functions exit_syscall_print_funcs = {
        .trace          = print_syscall_exit,
};

/*
 * Event class shared by all syscall enter events, in the "syscalls"
 * system.  Registration requests go to syscall_enter_register(), fields
 * come from syscall_enter_fields_array plus the per-syscall list returned
 * by syscall_get_enter_fields(), and events are initialized by
 * init_syscall_trace().  Note that .raw_init points at an __init function
 * while the class itself is marked __refdata.
 */
struct trace_event_class __refdata event_class_syscall_enter = {
        .system         = "syscalls",
        .reg            = syscall_enter_register,
        .fields_array   = syscall_enter_fields_array,
        .get_fields     = syscall_get_enter_fields,
        .raw_init       = init_syscall_trace,
};

/*
 * Event class shared by all syscall exit events, in the "syscalls"
 * system.  Registration requests go to syscall_exit_register() and events
 * are initialized by init_syscall_trace().  Every exit event has the same
 * two fields (__syscall_nr and ret), so they are described by an
 * anonymous table here and .fields is initialized as an empty list head
 * embedded in the class itself.  Note that .raw_init points at an __init
 * function while the class itself is marked __refdata.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
        .system         = "syscalls",
        .reg            = syscall_exit_register,
        .fields_array   = (struct trace_event_fields[]){
                SYSCALL_FIELD(int, __syscall_nr),
                SYSCALL_FIELD(long, ret),
                {}
        },
        .fields         = LIST_HEAD_INIT(event_class_syscall_exit.fields),
        .raw_init       = init_syscall_trace,
};

/*
 * Default (weak) lookup of the handler address for syscall number @nr:
 * the matching sys_call_table[] entry converted to an unsigned long.
 * Being __weak, an architecture may supply its own definition.
 * @nr is not range checked here.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
        unsigned long addr = (unsigned long)sys_call_table[nr];

        return addr;
}

/*
 * Build the syscall number -> metadata mapping at boot.
 *
 * For every syscall number below NR_syscalls, the handler address from
 * arch_syscall_addr() is looked up with find_syscall_meta(); on a match
 * the number is stored in the metadata and the metadata is recorded in
 * either the flat syscalls_metadata[] array or, when
 * CONFIG_HAVE_SPARSE_SYSCALL_NR is enabled, the syscalls_metadata_sparse
 * xarray.  Syscalls without metadata are skipped.
 */
void __init init_ftrace_syscalls(void)
{
        const bool sparse = IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR);
        struct syscall_metadata *meta;
        void *old;
        int nr;

        if (!sparse) {
                syscalls_metadata = kzalloc_objs(*syscalls_metadata,
                                                 NR_syscalls);
                /* Without the table nothing can be mapped; warn and bail. */
                if (WARN_ON(!syscalls_metadata))
                        return;
        }

        for (nr = 0; nr < NR_syscalls; nr++) {
                meta = find_syscall_meta(arch_syscall_addr(nr));
                if (!meta)
                        continue;

                meta->syscall_nr = nr;

                if (sparse) {
                        old = xa_store(&syscalls_metadata_sparse, nr, meta,
                                       GFP_KERNEL);
                        WARN(xa_is_err(old),
                                "Syscall memory allocation failed\n");
                } else {
                        syscalls_metadata[nr] = meta;
                }
        }
}

#ifdef CONFIG_PERF_EVENTS

/*
 * Perf state for syscall events.
 *
 * The bitmaps hold one bit per syscall number: set in
 * perf_sysenter_enable()/perf_sysexit_enable(), cleared in the matching
 * *_disable() and tested by the perf_syscall_enter()/perf_syscall_exit()
 * probes.  The refcounts count the enabled enter/exit events; the probes
 * are registered on the 0 -> 1 transition and unregistered when the count
 * drops back to 0.  The enable/disable paths modify all four while
 * holding syscall_trace_lock.
 */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;

static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
                               struct syscall_metadata *sys_data,
                               struct syscall_trace_enter *rec)
{
        struct syscall_tp_t {
                struct trace_entry ent;
                int syscall_nr;
                unsigned long args[SYSCALL_DEFINE_MAXARGS];
        } __aligned(8) param;
        int i;

        BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));

        /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
        perf_fetch_caller_regs(regs);
        *(struct pt_regs **)&param = regs;
        param.syscall_nr = rec->nr;
        for (i = 0; i < sys_data->nb_args; i++)
                param.args[i] = rec->args[i];
        return trace_call_bpf(call, &param);
}

/*
 * sys_enter tracepoint probe for perf (registered with a NULL data
 * pointer by perf_sysenter_enable(), hence @ignore).
 *
 * For syscalls whose bit is set in enabled_perf_enter_syscalls, builds a
 * syscall_trace_enter record holding the syscall number, its arguments
 * and, for syscalls with a non-zero user_mask, data obtained through
 * syscall_get_data()/syscall_put_data().  The record is offered to the
 * attached BPF programs (if any) and then submitted to the perf events on
 * this CPU's list.  @id is not used; the syscall number is derived from
 * @regs.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        struct pt_regs *fake_regs;
        struct hlist_head *head;
        unsigned long args[6];
        bool valid_prog_array;
        bool mayfault;
        char *user_ptr;
        int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
        /* perf always uses the default buffer size for user data */
        int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
        int syscall_nr;
        int rctx;
        int size = 0;
        int uargs = 0;

        /*
         * Syscall probe called with preemption enabled, but the ring
         * buffer and per-cpu data require preemption to be disabled.
         */
        might_fault();
        guard(preempt_notrace)();

        syscall_nr = trace_get_syscall_nr(current, regs);
        if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                return;
        if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        syscall_get_arguments(current, regs, args);

        /* Check if this syscall event faults in user space memory */
        mayfault = sys_data->user_mask != 0;

        if (mayfault) {
                /*
                 * On success "size" starts out as the number of extra bytes
                 * needed for the user data (presumably; syscall_get_data()
                 * is defined outside this section -- confirm there).
                 */
                if (syscall_get_data(sys_data, args, &user_ptr,
                                     &size, user_sizes, &uargs, buf_size) < 0)
                        return;
        }

        /* Nothing to do unless a BPF program or a perf event is attached. */
        head = this_cpu_ptr(sys_data->enter_event->perf_events);
        valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
        if (!valid_prog_array && hlist_empty(head))
                return;

        /* get the size after alignment with the u32 buffer size field */
        size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);

        rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
        if (!rec)
                return;

        rec->nr = syscall_nr;
        memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

        if (mayfault)
                syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

        /*
         * Drop the record, releasing the recursion context taken by
         * perf_trace_buf_alloc(), when the BPF programs return zero or when
         * there is no perf event on this CPU's list to submit to.
         */
        if ((valid_prog_array &&
             !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
            hlist_empty(head)) {
                perf_swevent_put_recursion_context(rctx);
                return;
        }

        perf_trace_buf_submit(rec, size, rctx,
                              sys_data->enter_event->event.type, 1, regs,
                              head, NULL);
}

/*
 * Enable perf tracing of the syscall enter event @call.
 *
 * Under syscall_trace_lock: enables the syscall fault buffer when the
 * syscall has a non-zero user_mask, registers perf_syscall_enter() on the
 * sys_enter tracepoint for the first enabled event, then sets the
 * syscall's bit in enabled_perf_enter_syscalls and bumps the refcount.
 *
 * Returns 0 on success or the error from the buffer/tracepoint setup; on
 * a tracepoint registration failure the fault buffer enable is undone.
 */
static int perf_sysenter_enable(struct trace_event_call *call)
{
        struct syscall_metadata *sys_data = call->data;
        int num;
        int ret;

        num = sys_data->syscall_nr;

        guard(mutex)(&syscall_trace_lock);
        if (sys_data->user_mask) {
                ret = syscall_fault_buffer_enable();
                if (ret < 0)
                        return ret;
        }
        if (!sys_perf_refcount_enter) {
                ret = register_trace_sys_enter(perf_syscall_enter, NULL);
                if (ret) {
                        /* Terminate the message so the log record is complete. */
                        pr_info("event trace: Could not activate syscall entry trace point\n");
                        if (sys_data->user_mask)
                                syscall_fault_buffer_disable();
                        return ret;
                }
        }
        set_bit(num, enabled_perf_enter_syscalls);
        sys_perf_refcount_enter++;
        return 0;
}

/*
 * Disable perf tracing of the syscall enter event @call; mirror image of
 * perf_sysenter_enable().
 *
 * Under syscall_trace_lock: drops the refcount, clears the syscall's bit
 * in enabled_perf_enter_syscalls, unregisters the probe once no enter
 * event remains enabled, and disables the syscall fault buffer if this
 * syscall has a non-zero user_mask.
 */
static void perf_sysenter_disable(struct trace_event_call *call)
{
        struct syscall_metadata *meta = call->data;
        int nr = meta->syscall_nr;

        guard(mutex)(&syscall_trace_lock);

        sys_perf_refcount_enter--;
        clear_bit(nr, enabled_perf_enter_syscalls);

        if (sys_perf_refcount_enter == 0)
                unregister_trace_sys_enter(perf_syscall_enter, NULL);

        if (meta->user_mask)
                syscall_fault_buffer_disable();
}

static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
                              struct syscall_trace_exit *rec)
{
        struct syscall_tp_t {
                struct trace_entry ent;
                int syscall_nr;
                unsigned long ret;
        } __aligned(8) param;

        /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
        perf_fetch_caller_regs(regs);
        *(struct pt_regs **)&param = regs;
        param.syscall_nr = rec->nr;
        param.ret = rec->ret;
        return trace_call_bpf(call, &param);
}

static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
        struct pt_regs *fake_regs;
        struct hlist_head *head;
        bool valid_prog_array;
        int syscall_nr;
        int rctx;
        int size;

        /*
         * Syscall probe called with preemption enabled, but the ring
         * buffer and per-cpu data require preemption to be disabled.
         */
        might_fault();
        guard(preempt_notrace)();

        syscall_nr = trace_get_syscall_nr(current, regs);
        if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                return;
        if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
                return;

        sys_data = syscall_nr_to_meta(syscall_nr);
        if (!sys_data)
                return;

        head = this_cpu_ptr(sys_data->exit_event->perf_events);
        valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
        if (!valid_prog_array && hlist_empty(head))
                return;

        /* We can probably do that at build time */
        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);

        rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
        if (!rec)
                return;

        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);

        if ((valid_prog_array &&
             !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
            hlist_empty(head)) {
                perf_swevent_put_recursion_context(rctx);
                return;
        }

        perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
                              1, regs, head, NULL);
}

/*
 * Enable perf tracing of the syscall exit event @call.
 *
 * Under syscall_trace_lock: registers perf_syscall_exit() on the sys_exit
 * tracepoint for the first enabled event, then sets the syscall's bit in
 * enabled_perf_exit_syscalls and bumps the refcount.
 *
 * Returns 0 on success or the error from register_trace_sys_exit().
 */
static int perf_sysexit_enable(struct trace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        guard(mutex)(&syscall_trace_lock);
        if (!sys_perf_refcount_exit) {
                int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
                if (ret) {
                        /* Terminate the message so the log record is complete. */
                        pr_info("event trace: Could not activate syscall exit trace point\n");
                        return ret;
                }
        }
        set_bit(num, enabled_perf_exit_syscalls);
        sys_perf_refcount_exit++;
        return 0;
}

static void perf_sysexit_disable(struct trace_event_call *call)
{
        int num;

        num = ((struct syscall_metadata *)call->data)->syscall_nr;

        guard(mutex)(&syscall_trace_lock);
        sys_perf_refcount_exit--;
        clear_bit(num, enabled_perf_exit_syscalls);
        if (!sys_perf_refcount_exit)
                unregister_trace_sys_exit(perf_syscall_exit, NULL);
}

#endif /* CONFIG_PERF_EVENTS */

/*
 * .reg callback of event_class_syscall_enter: dispatch a registration
 * request of kind @type for the syscall enter event @event.
 *
 * @data is the trace_event_file for the ftrace (un)register requests; the
 * perf requests do not use it.  Returns the result of the register
 * helpers and 0 for everything else.
 */
static int syscall_enter_register(struct trace_event_call *event,
                                 enum trace_reg type, void *data)
{
        struct trace_event_file *file = data;
        int ret = 0;

        switch (type) {
        case TRACE_REG_REGISTER:
                ret = reg_event_syscall_enter(file, event);
                break;
        case TRACE_REG_UNREGISTER:
                unreg_event_syscall_enter(file, event);
                break;

#ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
                ret = perf_sysenter_enable(event);
                break;
        case TRACE_REG_PERF_UNREGISTER:
                perf_sysenter_disable(event);
                break;
        /* Nothing to do for the remaining perf requests. */
        case TRACE_REG_PERF_OPEN:
        case TRACE_REG_PERF_CLOSE:
        case TRACE_REG_PERF_ADD:
        case TRACE_REG_PERF_DEL:
                break;
#endif
        }

        return ret;
}

/*
 * .reg callback of event_class_syscall_exit: dispatch a registration
 * request of kind @type for the syscall exit event @event.
 *
 * @data is the trace_event_file for the ftrace (un)register requests; the
 * perf requests do not use it.  Returns the result of the register
 * helpers and 0 for everything else.
 */
static int syscall_exit_register(struct trace_event_call *event,
                                 enum trace_reg type, void *data)
{
        struct trace_event_file *file = data;
        int ret = 0;

        switch (type) {
        case TRACE_REG_REGISTER:
                ret = reg_event_syscall_exit(file, event);
                break;
        case TRACE_REG_UNREGISTER:
                unreg_event_syscall_exit(file, event);
                break;

#ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
                ret = perf_sysexit_enable(event);
                break;
        case TRACE_REG_PERF_UNREGISTER:
                perf_sysexit_disable(event);
                break;
        /* Nothing to do for the remaining perf requests. */
        case TRACE_REG_PERF_OPEN:
        case TRACE_REG_PERF_CLOSE:
        case TRACE_REG_PERF_ADD:
        case TRACE_REG_PERF_DEL:
                break;
#endif
        }

        return ret;
}