root/io_uring/bpf_filter.c
// SPDX-License-Identifier: GPL-2.0
/*
 * BPF filter support for io_uring. Filtering is done per SQE opcode for now.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io_uring.h>
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/nospec.h>
#include <linux/slab.h>
#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "bpf_filter.h"
#include "net.h"
#include "openclose.h"

struct io_bpf_filter {
        refcount_t              refs;
        struct bpf_prog         *prog;
        struct io_bpf_filter    *next;
};
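
/*
 * Filters for a single opcode form a singly linked list. New filters are
 * only ever inserted at the head and nodes are never modified once they
 * are visible, so readers can walk the list under RCU while each node's
 * lifetime is managed by its refcount (nodes may be shared between cloned
 * filter sets).
 */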

/* Sentinel filter: a request is denied if this is installed for its opcode */
static const struct io_bpf_filter dummy_filter;

static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
                                      struct io_kiocb *req)
{
        const struct io_issue_def *def = &io_issue_defs[req->opcode];

        bctx->opcode = req->opcode;
        bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
        bctx->user_data = req->cqe.user_data;
        /* clear residual data, everything from pdu_size to the end of bctx */
        memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
                sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));

        /*
         * Opcodes can provide a handler for populating more data into bctx,
         * for filters to use.
         */
        if (def->filter_pdu_size) {
                bctx->pdu_size = def->filter_pdu_size;
                def->filter_populate(bctx, req);
        }
}
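
/*
 * Note that for opcodes without a filter_populate() handler, a filter sees
 * only the opcode, sqe_flags, and user_data fields - pdu_size is 0 and the
 * pdu area stays zeroed.
 */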

/*
 * Run registered filters for a given opcode. A filter returning 0 denies
 * execution of the request, a return of 1 allows it. If any filter for an
 * opcode returns 0, processing of the remaining filters stops and the
 * request is denied.
 *
 * __io_uring_run_bpf_filters() returns 0 on success, allowing the request
 * to run, and -EACCES if the request is denied.
 */
int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
                               struct io_kiocb *req)
{
        struct io_bpf_filter *filter;
        struct io_uring_bpf_ctx bpf_ctx;
        int ret;

        /* Fast check for existence of filters outside of RCU */
        if (!rcu_access_pointer(filters[req->opcode]))
                return 0;

        /*
         * req->opcode has already been validated to be within the range
         * of what we expect, io_init_req() does this.
         */
        guard(rcu)();
        filter = rcu_dereference(filters[req->opcode]);
        if (!filter)
                return 0;
        else if (filter == &dummy_filter)
                return -EACCES;

        io_uring_populate_bpf_ctx(&bpf_ctx, req);

        /*
         * Iterate the registered filters. The request is allowed iff all
         * filters return 1 - if any filter returns 0, it is denied.
         */
        do {
                if (filter == &dummy_filter)
                        return -EACCES;
                ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
                if (!ret)
                        return -EACCES;
                filter = filter->next;
        } while (filter);

        return 0;
}
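
/*
 * A minimal sketch of a userspace filter (illustrative only, not compiled
 * here): allow IORING_OP_NOP and deny everything else. This assumes the
 * ctx 'opcode' field sits at a 32-bit aligned offset and can be fetched
 * with a 32-bit load, as io_uring_check_cbpf_filter() requires:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct io_uring_bpf_ctx, opcode)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IORING_OP_NOP, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 1),	<- allow
 *		BPF_STMT(BPF_RET | BPF_K, 0),	<- deny
 *	};
 */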

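/*
 * RCU callback that tears down a filter set once the final reference has
 * been dropped and a grace period has elapsed, guaranteeing that no
 * submitter is still walking these lists under rcu_read_lock().
 */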
static void io_free_bpf_filters(struct rcu_head *head)
{
        struct io_bpf_filter __rcu **filter;
        struct io_bpf_filters *filters;
        int i;

        filters = container_of(head, struct io_bpf_filters, rcu_head);
        scoped_guard(spinlock, &filters->lock) {
                filter = filters->filters;
        }
        /* Defensive: nothing to tear down, just free the container */
        if (!filter) {
                kfree(filters);
                return;
        }

        for (i = 0; i < IORING_OP_LAST; i++) {
                struct io_bpf_filter *f;

                rcu_read_lock();
                f = rcu_dereference(filter[i]);
                while (f) {
                        struct io_bpf_filter *next = f->next;

                        /*
                         * Even when filters are stacked, the dummy filter
                         * will always be last, as it can only get installed
                         * into an empty slot.
                         */
                        if (f == &dummy_filter)
                                break;

                        /* Someone still holds a ref, stop iterating. */
                        if (!refcount_dec_and_test(&f->refs))
                                break;

                        bpf_prog_destroy(f->prog);
                        kfree(f);
                        f = next;
                }
                rcu_read_unlock();
        }
        kfree(filters->filters);
        kfree(filters);
}

static void __io_put_bpf_filters(struct io_bpf_filters *filters)
{
        if (refcount_dec_and_test(&filters->refs))
                call_rcu(&filters->rcu_head, io_free_bpf_filters);
}

void io_put_bpf_filters(struct io_restriction *res)
{
        if (res->bpf_filters)
                __io_put_bpf_filters(res->bpf_filters);
}

static struct io_bpf_filters *io_new_bpf_filters(void)
{
        struct io_bpf_filters *filters __free(kfree) = NULL;

        filters = kzalloc_obj(*filters, GFP_KERNEL_ACCOUNT);
        if (!filters)
                return ERR_PTR(-ENOMEM);

        filters->filters = kzalloc_objs(struct io_bpf_filter *, IORING_OP_LAST,
                                        GFP_KERNEL_ACCOUNT);
        if (!filters->filters)
                return ERR_PTR(-ENOMEM);

        refcount_set(&filters->refs, 1);
        spin_lock_init(&filters->lock);
        return no_free_ptr(filters);
}

/*
 * Validate classic BPF filter instructions. Only allow a safe subset of
 * operations - no packet data access, just context field loads and basic
 * ALU/jump operations.
 */
static int io_uring_check_cbpf_filter(struct sock_filter *filter,
                                      unsigned int flen)
{
        int pc;

        for (pc = 0; pc < flen; pc++) {
                struct sock_filter *ftest = &filter[pc];
                u16 code = ftest->code;
                u32 k = ftest->k;

                switch (code) {
                case BPF_LD | BPF_W | BPF_ABS:
                        ftest->code = BPF_LDX | BPF_W | BPF_ABS;
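                        /*
                         * The LDX|W|ABS encoding is an internal marker:
                         * the cBPF->eBPF converter turns it into a load
                         * from the context (struct io_uring_bpf_ctx)
                         * rather than from packet data. seccomp uses the
                         * same approach for struct seccomp_data.
                         */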
                        /* 32-bit aligned and not out of bounds. */
                        if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
                                return -EINVAL;
                        continue;
                case BPF_LD | BPF_W | BPF_LEN:
                        ftest->code = BPF_LD | BPF_IMM;
                        ftest->k = sizeof(struct io_uring_bpf_ctx);
                        continue;
                case BPF_LDX | BPF_W | BPF_LEN:
                        ftest->code = BPF_LDX | BPF_IMM;
                        ftest->k = sizeof(struct io_uring_bpf_ctx);
                        continue;
                /* Explicitly list the allowed instructions. */
                case BPF_RET | BPF_K:
                case BPF_RET | BPF_A:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                case BPF_MISC | BPF_TAX:
                case BPF_MISC | BPF_TXA:
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                case BPF_JMP | BPF_JA:
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        continue;
                default:
                        return -EINVAL;
                }
        }
        return 0;
}

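/*
 * Called when a ring is cloned: the destination shares the source's filter
 * set by taking a reference on it, and is marked copy-on-write so the set
 * gets duplicated before the destination can modify it.
 */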
void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
{
        if (!src->bpf_filters)
                return;

        rcu_read_lock();
        /*
         * If the src filter set is already going away, just ignore it.
         */
        if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
                dst->bpf_filters = src->bpf_filters;
                dst->bpf_filters_cow = true;
        }
        rcu_read_unlock();
}

/*
 * Allocate a new struct io_bpf_filters, copying references to the filters
 * in src. Used when a cloned (shared) filter set needs to be modified -
 * classic copy-on-write.
 */
static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
{
        struct io_bpf_filters *filters;
        struct io_bpf_filter *srcf;
        int i;

        filters = io_new_bpf_filters();
        if (IS_ERR(filters))
                return filters;

        /*
         * Iterate filters from src and assign them in the destination.
         * Grabbing a reference is enough, we don't need to duplicate the
         * memory. This is safe because filters are only ever added at the
         * head of the list, hence the only memory ever touched inside a
         * filter is the refcount.
         */
        rcu_read_lock();
        for (i = 0; i < IORING_OP_LAST; i++) {
                srcf = rcu_dereference(src->bpf_filters->filters[i]);
                if (!srcf) {
                        continue;
                } else if (srcf == &dummy_filter) {
                        rcu_assign_pointer(filters->filters[i], &dummy_filter);
                        continue;
                }

                /*
                 * Getting a ref on the head node is enough - when the set
                 * is later put, the freeing loop stops at the first node
                 * whose refcount doesn't drop to zero.
                 */
                if (!refcount_inc_not_zero(&srcf->refs))
                        goto err;
                rcu_assign_pointer(filters->filters[i], srcf);
        }
        rcu_read_unlock();
        return filters;
err:
        rcu_read_unlock();
        __io_put_bpf_filters(filters);
        return ERR_PTR(-EBUSY);
}

#define IO_URING_BPF_FILTER_FLAGS       (IO_URING_BPF_FILTER_DENY_REST | \
                                         IO_URING_BPF_FILTER_SZ_STRICT)

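/*
 * Copy in and validate a filter registration request. The kernel's pdu
 * size for the opcode is copied back to userspace even when the size
 * check fails, so userspace can discover the expected size.
 */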
static int io_bpf_filter_import(struct io_uring_bpf *reg,
                                struct io_uring_bpf __user *arg)
{
        const struct io_issue_def *def;
        int ret;

        if (copy_from_user(reg, arg, sizeof(*reg)))
                return -EFAULT;
        if (reg->cmd_type != IO_URING_BPF_CMD_FILTER)
                return -EINVAL;
        if (reg->cmd_flags || reg->resv)
                return -EINVAL;

        if (reg->filter.opcode >= IORING_OP_LAST)
                return -EINVAL;
        if (reg->filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
                return -EINVAL;
        if (!mem_is_zero(reg->filter.resv, sizeof(reg->filter.resv)))
                return -EINVAL;
        if (!mem_is_zero(reg->filter.resv2, sizeof(reg->filter.resv2)))
                return -EINVAL;
        if (!reg->filter.filter_len || reg->filter.filter_len > BPF_MAXINSNS)
                return -EINVAL;

        /* Verify filter size */
        def = &io_issue_defs[array_index_nospec(reg->filter.opcode, IORING_OP_LAST)];

        /* same size is always ok... */
        ret = 0;
        if (reg->filter.pdu_size != def->filter_pdu_size) {
                /*
                 * ...otherwise fail in strict mode, and always disallow a
                 * userspace pdu size bigger than the kernel's.
                 */
                if ((reg->filter.flags & IO_URING_BPF_FILTER_SZ_STRICT) ||
                    reg->filter.pdu_size > def->filter_pdu_size)
                        ret = -EMSGSIZE;
        }

        /* copy back kernel filter size */
        reg->filter.pdu_size = def->filter_pdu_size;
        if (copy_to_user(&arg->filter, &reg->filter, sizeof(reg->filter)))
                return -EFAULT;

        return ret;
}

int io_register_bpf_filter(struct io_restriction *res,
                           struct io_uring_bpf __user *arg)
{
        struct io_bpf_filters *filters, *old_filters = NULL;
        struct io_bpf_filter *filter, *old_filter;
        struct io_uring_bpf reg;
        struct bpf_prog *prog;
        struct sock_fprog fprog;
        int ret;

        ret = io_bpf_filter_import(&reg, arg);
        if (ret)
                return ret;

        fprog.len = reg.filter.filter_len;
        fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);

        ret = bpf_prog_create_from_user(&prog, &fprog,
                                        io_uring_check_cbpf_filter, false);
        if (ret)
                return ret;

        /*
         * If no filter set exists yet, allocate one. If the existing set
         * is shared with a cloned ring, copy it first (COW) so the
         * modification doesn't affect the other ring.
         */
        filters = res->bpf_filters;
        if (!filters) {
                filters = io_new_bpf_filters();
                if (IS_ERR(filters)) {
                        ret = PTR_ERR(filters);
                        goto err_prog;
                }
        } else if (res->bpf_filters_cow) {
                filters = io_bpf_filter_cow(res);
                if (IS_ERR(filters)) {
                        ret = PTR_ERR(filters);
                        goto err_prog;
                }
                /*
                 * Stash old filters, we'll put them once we know we'll
                 * succeed. Until then, res->bpf_filters is left untouched.
                 */
                old_filters = res->bpf_filters;
        }

        filter = kzalloc_obj(*filter, GFP_KERNEL_ACCOUNT);
        if (!filter) {
                ret = -ENOMEM;
                goto err;
        }
        refcount_set(&filter->refs, 1);
        filter->prog = prog;

        /*
         * Success - install the new filter set now. If we did COW, put
         * the old filters as we're replacing them.
         */
        if (old_filters) {
                __io_put_bpf_filters(old_filters);
                res->bpf_filters_cow = false;
        }
        res->bpf_filters = filters;

        /*
         * Insert the new filter at the head of the list - if the opcode
         * already has filters attached, they are chained behind it.
         */
        rcu_read_lock();
        spin_lock_bh(&filters->lock);
        old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
        if (old_filter)
                filter->next = old_filter;
        rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);

        /*
         * If IO_URING_BPF_FILTER_DENY_REST is set, install the dummy
         * filter for any opcode that has no filter registered, causing
         * those opcodes to be denied.
         */
        if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
                for (int i = 0; i < IORING_OP_LAST; i++) {
                        if (i == reg.filter.opcode)
                                continue;
                        old_filter = rcu_dereference(filters->filters[i]);
                        if (old_filter)
                                continue;
                        rcu_assign_pointer(filters->filters[i], &dummy_filter);
                }
        }

        spin_unlock_bh(&filters->lock);
        rcu_read_unlock();
        return 0;
err:
        if (filters != res->bpf_filters)
                __io_put_bpf_filters(filters);
err_prog:
        bpf_prog_destroy(prog);
        return ret;
}
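
/*
 * Illustrative registration sketch from userspace, via the
 * io_uring_register() syscall. The register opcode IORING_REGISTER_BPF
 * below is a placeholder for whatever the uapi actually defines - consult
 * the uapi header for the real name:
 *
 *	struct io_uring_bpf reg = {
 *		.cmd_type		= IO_URING_BPF_CMD_FILTER,
 *		.filter.opcode		= IORING_OP_NOP,
 *		.filter.flags		= IO_URING_BPF_FILTER_DENY_REST,
 *		.filter.filter_len	= ARRAY_SIZE(insns),
 *		.filter.filter_ptr	= (unsigned long) insns,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_BPF, &reg, 1);
 */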