tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors.  Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
        union {
                atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                struct {
                        u8 locked;
                        u8 pending;
                };
                struct {
                        u16 locked_pending;
                        u16 tail;
                };
#else
                struct {
                        u16 tail;
                        u16 locked_pending;
                };
                struct {
                        u8 reserved[2];
                        u8 pending;
                        u8 locked;
                };
#endif
        };
};
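
/*
 * An illustrative reading of the union on little-endian:
 * atomic_read(&lock->val) == 0x00000001 means locked == 1 with the pending
 * bit clear and an empty tail, i.e. the _Q_LOCKED_VAL state defined below.
 */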

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
        struct arena_mcs_spinlock __arena *next;
        int locked;
        int count;
};

struct arena_qnode {
        struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES            4
#define _Q_PENDING_LOOPS        1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS             1024

#define _Q_SET_MASK(type)       (((1U << _Q_ ## type ## _BITS) - 1)\
                                      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET        0
#define _Q_LOCKED_BITS          8
#define _Q_LOCKED_MASK          _Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET       (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS         8
#define _Q_PENDING_MASK         _Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET      (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS        2
#define _Q_TAIL_IDX_MASK        _Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET      (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS        (32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK        _Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET          _Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK            (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL           (1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL          (1U << _Q_PENDING_OFFSET)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];
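
/*
 * Sizing note: this table consumes _Q_MAX_CPUS * _Q_MAX_NODES *
 * sizeof(struct arena_qnode) bytes of arena memory; assuming 8-byte arena
 * pointers and no padding, that is 1024 * 4 * 16 bytes, i.e. 64 KiB.
 */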

static inline u32 encode_tail(int cpu, int idx)
{
        u32 tail;

        tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
        tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

        return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
        u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

        return &qnodes[cpu][idx].mcs;
}
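
/*
 * A worked example of the tail encoding (illustrative values):
 * encode_tail(2, 1) yields
 * (2 + 1) << _Q_TAIL_CPU_OFFSET | 1 << _Q_TAIL_IDX_OFFSET,
 * i.e. 0xc0000 | 0x10000 == 0xd0000, and decode_tail(0xd0000) recovers
 * &qnodes[2][1].mcs. The cpu is stored off by one so that an empty queue
 * encodes as a zero tail.
 */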

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
        return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = (old & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                /*
                 * These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}
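
/*
 * Note: storing the 16-bit _Q_LOCKED_VAL (0x0001) through locked_pending
 * writes locked = 1 and pending = 0 in a single store; the byte-swapped
 * big-endian layout of struct __qspinlock above preserves this effect.
 */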

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = old | _Q_PENDING_VAL;
                /*
                 * These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
        int val = atomic_read(&lock->val);

        if (unlikely(val))
                return 0;

        return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
        struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
        int ret = -ETIMEDOUT;
        u32 old, tail;
        int idx;

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed_label(&lock->val,
                                                     (VAL != _Q_PENDING_VAL) || !cnt--,
                                                     release_err);
        }

        /*
         * If we observe any contention, queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = arena_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> *,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK)
                (void)smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        return 0;

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
        idx = node0->count++;
        tail = encode_tail(bpf_get_smp_processor_id(), idx);

        /*
         * 4 nodes are allocated based on the assumption that there will not be
         * nested NMIs taking spinlocks. That may not be true on some
         * architectures, even though the chance of needing more than 4 nodes
         * is still extremely low. When that happens, we simply return an
         * error. The original qspinlock has a trylock fallback in this case.
         */
        if (unlikely(idx >= _Q_MAX_NODES)) {
                ret = -EBUSY;
                goto release_node_err;
        }

        node = grab_mcs_node(node0, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (arena_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * If there was a previous node, link it and wait until we reach the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                prev = decode_tail(old);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                (void)arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We cannot prefetch here
                 * due to the lack of an equivalent instruction in the BPF ISA.
                 */
                next = READ_ONCE(node->next);
        }

        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         */
        val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
                                             release_node_err);

        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * In the PV case we might already have _Q_LOCKED_VAL set, because
         * of lock stealing; therefore we must also allow:
         *
         * n,0,1 -> 0,0,1
         *
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         *       above wait condition, therefore any concurrent setting of
         *       PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us, or _Q_PENDING_VAL got set,
         * in which case that locker will detect the remaining tail and queue
         * behind us, ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

        arch_mcs_spin_unlock_contended(&next->locked);

release:;
        /*
         * Release the node.
         *
         * Doing a normal dec vs this_cpu_dec is fine. An upper context always
         * decrements the count it incremented before returning, thus we're
         * fine. For contexts interrupting us, they either observe our dec or
         * not. Just ensure the compiler doesn't reorder this statement, as a
         * this_cpu_dec would have implied that implicitly.
         */
        barrier();
        node0->count--;
        return 0;
release_node_err:
        barrier();
        node0->count--;
release_err:
        return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * instead of being compared against a negative value, in order
 * to pass the BPF verifier.
 *
 * The user should do:
 *      if (arena_spin_lock(...) != 0) // failure
 *              or
 *      if (arena_spin_lock(...) == 0) // success
 *              or
 *      if (arena_spin_lock(...)) // failure
 *              or
 *      if (!arena_spin_lock(...)) // success
 * instead of:
 *      if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
        int val = 0;

        if (CONFIG_NR_CPUS > _Q_MAX_CPUS)
                return -EOPNOTSUPP;

        bpf_preempt_disable();
        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return 0;

        val = arena_spin_lock_slowpath(lock, val);
        /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
        if (val)
                bpf_preempt_enable();
        return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
        bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)             \
        ({                                               \
                int __ret;                               \
                bpf_local_irq_save(&(flags));            \
                __ret = arena_spin_lock((lock));         \
                if (__ret)                               \
                        bpf_local_irq_restore(&(flags)); \
                (__ret);                                 \
        })

#define arena_spin_unlock_irqrestore(lock, flags) \
        ({                                        \
                arena_spin_unlock((lock));        \
                bpf_local_irq_restore(&(flags));  \
        })
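
/*
 * A minimal usage sketch (illustrative; the arena-backed global and the
 * program below are hypothetical, not part of this header):
 *
 *      arena_spinlock_t __arena lock;
 *
 *      SEC("syscall")
 *      int prog(void *ctx)
 *      {
 *              unsigned long flags;
 *
 *              if (arena_spin_lock_irqsave(&lock, flags))
 *                      return 1;
 *              ... critical section ...
 *              arena_spin_unlock_irqrestore(&lock, flags);
 *              return 0;
 *      }
 */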

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */