/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
        union {
                atomic_t val;
                u32 locked;
        };
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
        u32 val;
} __aligned(__alignof__(struct rqspinlock));

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
        return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
        return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
        int cnt;
        void *locks[RES_NR_HELD];
};
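
/*
 * Worked example for the choice of RES_NR_HELD above: on a typical 64-bit
 * configuration (8-byte pointers, 64-byte cachelines), this struct works out
 * to 4 bytes of cnt + 4 bytes of padding + 31 * 8 bytes of lock slots =
 * 256 bytes, an exact multiple of the cacheline size.
 */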

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
        int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

        if (unlikely(cnt > RES_NR_HELD)) {
                /* Still keep the inc so we decrement later. */
                return;
        }

        /*
         * Implied compiler barrier in per-CPU operations; otherwise we can have
         * the compiler reorder inc with write to table, allowing interrupts to
         * overwrite and erase our write to the table (as on interrupt exit it
         * will be reset to NULL).
         *
         * It is fine for the cnt inc to be reordered wrt remote readers,
         * though; they won't observe our entry until the cnt update is
         * visible, that's all.
         */
        this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs, if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct logic to preserve the right entries in the table would be to walk the
 * array of held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out-of-order
 * unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

        if (unlikely(rqh->cnt > RES_NR_HELD))
                goto dec;
        WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
        /*
         * Reordering of the clearing above with the inc and its write in
         * grab_held_lock_entry that came before us (in the same acquisition
         * attempt) is fine; we either see a valid entry or NULL once it
         * becomes visible.
         *
         * But this helper is invoked when we unwind upon failing to acquire
         * the lock. Unlike the unlock path, which performs a release store on
         * the lock word, we make no store to the lock here, so we need to emit
         * a write barrier. Otherwise, we may have a situation as follows:
         *
         * <error> for lock B
         * release_held_lock_entry
         *
         * grab_held_lock_entry
         * try_cmpxchg_acquire for lock A
         *
         * Lack of any ordering means reordering may occur such that the dec
         * and inc are done before the entry is overwritten. This permits a
         * remote holder of lock B (which this CPU failed to acquire) to
         * observe B as still being attempted on this CPU, and may lead to
         * misdetection (if this CPU holds a lock that the remote holder is
         * attempting to acquire, leading to a false ABBA diagnosis).
         *
         * The case of unlock is treated differently due to NMI reentrancy;
         * see the comments in res_spin_unlock.
         *
         * In theory we don't have a problem if the dec and the WRITE_ONCE
         * above get reordered with each other: we either notice an empty NULL
         * entry on top (if the dec lands after the WRITE_ONCE), or a
         * potentially stale entry which cannot be observed because cnt has
         * already shrunk (if the dec lands before the WRITE_ONCE).
         *
         * Emit the write barrier _before_ the dec; this permits dec-inc
         * reordering, but that is harmless as the entry has already been set
         * to NULL, i.e. they cannot precede the NULL store above.
         */
        smp_wmb();
        this_cpu_dec(rqspinlock_held_locks.cnt);
}
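
/*
 * Illustrative pairing (a sketch, not the actual slowpath; the
 * try_acquire_with_timeout() helper below is hypothetical): an entry grabbed
 * via grab_held_lock_entry() must be dropped via release_held_lock_entry()
 * whenever the acquisition attempt is abandoned:
 *
 *      grab_held_lock_entry(lock);
 *      if (!try_acquire_with_timeout(lock)) {
 *              release_held_lock_entry();
 *              return -ETIMEDOUT;
 *      }
 *      return 0;
 */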

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0          - Lock was acquired successfully.
 * * -EDEADLK   - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
        int val = 0;

        /*
         * Grab the deadlock detection entry before doing the cmpxchg, so that
         * an NMI landing between a successful cmpxchg and the creation of the
         * held lock entry can still detect the acquisition attempt in the
         * interrupted context. The opposite order shown below may lead to a
         * missed AA check:
         *
         * cmpxchg lock A
         * <NMI>
         * res_spin_lock(A) --> missed AA, leads to timeout
         * </NMI>
         * grab_held_lock_entry(A)
         */
        grab_held_lock_entry(lock);

        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return 0;
        return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); })

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

        /*
         * Release barrier, ensuring correct ordering. Perform a release store
         * instead of queued_spin_unlock, since we use this function for the
         * TAS fallback as well. When CONFIG_QUEUED_SPINLOCKS=n, this clears
         * the full 4-byte lockword.
         *
         * Perform the smp_store_release before clearing the lock entry so that
         * NMIs landing in the unlock path can correctly detect AA issues. The
         * opposite order shown below may lead to missed AA checks:
         *
         * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
         * <NMI>
         * res_spin_lock(A) --> missed AA, leads to timeout
         * </NMI>
         * smp_store_release(A->locked, 0)
         */
        smp_store_release(&lock->locked, 0);
        if (likely(rqh->cnt <= RES_NR_HELD))
                WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
        this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif

#define raw_res_spin_lock(lock)                    \
        ({                                         \
                int __ret;                         \
                preempt_disable();                 \
                __ret = res_spin_lock(lock);       \
                if (__ret)                         \
                        preempt_enable();          \
                __ret;                             \
        })

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)    \
        ({                                        \
                int __ret;                        \
                local_irq_save(flags);            \
                __ret = raw_res_spin_lock(lock);  \
                if (__ret)                        \
                        local_irq_restore(flags); \
                __ret;                            \
        })

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
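
/*
 * Example usage (illustrative only; 'demo_lock' and the surrounding code are
 * hypothetical). Unlike regular spinlocks, acquisition can fail, so the
 * return value must be checked and the unlock skipped on failure:
 *
 *      static rqspinlock_t demo_lock;
 *      unsigned long flags;
 *      int ret;
 *
 *      raw_res_spin_lock_init(&demo_lock);
 *      ret = raw_res_spin_lock_irqsave(&demo_lock, flags);
 *      if (ret)
 *              return ret;     (-EDEADLK or -ETIMEDOUT, lock is not held)
 *      ... critical section ...
 *      raw_res_spin_unlock_irqrestore(&demo_lock, flags);
 */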

#endif /* __ASM_GENERIC_RQSPINLOCK_H */