// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
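
/*
 * For reference (assuming tracefs is mounted at /sys/kernel/tracing), these
 * tracepoints can be enabled from userspace with, e.g.:
 *
 *	echo 1 > /sys/kernel/tracing/events/mmap_lock/enable
 *	cat /sys/kernel/tracing/trace_pipe
 */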

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success)
{
        trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK

/* State shared across __vma_[start, end]_exclude_readers. */
struct vma_exclude_readers_state {
        /* Input parameters. */
        struct vm_area_struct *vma;
        int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
        bool detaching;

        /* Output parameters. */
        bool detached;
        bool exclusive; /* Are we exclusively locked? */
};

/*
 * Now that all readers have been evicted, mark the VMA as being out of the
 * 'exclude readers' state.
 */
static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
{
        struct vm_area_struct *vma = ves->vma;

        VM_WARN_ON_ONCE(ves->detached);

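        /*
         * Subtracting the flag leaves only plain references: for example,
         * VM_REFCNT_EXCLUDE_READERS_FLAG + 1 drops to 1 (the VMA remains
         * attached), while a bare VM_REFCNT_EXCLUDE_READERS_FLAG drops to 0,
         * meaning the VMA is now detached, which we report via ves->detached.
         */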
        ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
                                              &vma->vm_refcnt);
        __vma_lockdep_release_exclusive(vma);
}

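/*
 * Compute the vm_refcnt value an exclusive writer waits for: the
 * exclude-readers flag plus the VMA's own attach reference when the VMA is
 * to remain attached, or the bare flag when detaching, in which case no
 * references should survive.
 */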
static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
{
        const unsigned int tgt = ves->detaching ? 0 : 1;

        return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
}

/*
 * Mark the VMA as being in a state of excluding readers, check to see if any
 * VMA read locks are indeed held, and if so wait for them to be released.
 *
 * Note that this function pairs with vma_refcount_put() which will wake up this
 * thread when it detects that the last reader has released its lock.
 *
 * The ves->state parameter should be set to TASK_UNINTERRUPTIBLE where the
 * thread must sleep uninterruptibly, or to TASK_KILLABLE if a fatal signal
 * is permitted to kill it.
 *
 * The function sets the ves->exclusive parameter to true if readers were
 * excluded, and leaves it false if the VMA was detached or an error arose
 * while waiting.
 *
 * If the function indicates an exclusive lock was acquired via ves->exclusive
 * the caller is required to invoke __vma_end_exclude_readers() once the
 * exclusive state is no longer required.
 *
 * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
 * function may also return -EINTR to indicate a fatal signal was received while
 * waiting.  Otherwise, the function returns 0.
 */
static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
{
        struct vm_area_struct *vma = ves->vma;
        unsigned int tgt_refcnt = get_target_refcnt(ves);
        int err = 0;

        mmap_assert_write_locked(vma->vm_mm);

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         *
         * See the comment describing the vm_area_struct->vm_refcnt field for
         * details of possible refcnt values.
         */
        if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
                ves->detached = true;
                return 0;
        }

        __vma_lockdep_acquire_exclusive(vma);
        err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                   ves->state);
        if (err) {
                __vma_end_exclude_readers(ves);
                return err;
        }

        __vma_lockdep_stat_mark_acquired(vma);
        ves->exclusive = true;
        return 0;
}

int __vma_start_write(struct vm_area_struct *vma, int state)
{
        const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
        struct vma_exclude_readers_state ves = {
                .vma = vma,
                .state = state,
        };
        int err;

        err = __vma_start_exclude_readers(&ves);
        if (err) {
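                /* Even on failure the VMA should remain attached. */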
                WARN_ON_ONCE(ves.detached);
                return err;
        }

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        if (ves.exclusive) {
                __vma_end_exclude_readers(&ves);
                /* VMA should remain attached. */
                WARN_ON_ONCE(ves.detached);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
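
/*
 * A hypothetical caller-side sketch (illustrative only; no such caller lives
 * in this file): a path that can tolerate being killed while waiting for
 * readers to drain might do
 *
 *	if (__vma_start_write(vma, TASK_KILLABLE))
 *		return -EINTR;	(bail out; the VMA was not write-locked)
 *
 * while the common case passes TASK_UNINTERRUPTIBLE, for which the return
 * value is always 0 and may be ignored.
 */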

void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
{
        struct vma_exclude_readers_state ves = {
                .vma = vma,
                .state = TASK_UNINTERRUPTIBLE,
                .detaching = true,
        };
        int err;

        /*
         * Wait until the VMA is detached with no readers. Since we hold the
         * VMA write lock, the only readers that might be present are threads
         * which raced with us: they increment the reference count while
         * trying to acquire the read lock, then notice the write lock is
         * held and decrement it again.
         */
        err = __vma_start_exclude_readers(&ves);
        if (!err && ves.exclusive) {
                /*
                 * Once this is complete, no readers can increment the
                 * reference count, and the VMA is marked detached.
                 */
                __vma_end_exclude_readers(&ves);
        }
        /*
         * The wait above is uninterruptible, so err should always be zero
         * here; either way, the VMA must have ended up detached.
         */
        WARN_ON_ONCE(!ves.detached);
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 *            IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        struct mm_struct *other_mm;
        int oldcnt;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
                vma = NULL;
                goto err;
        }

        /*
         * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
         * __refcount_inc_not_zero_limited_acquire() will fail because
         * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
         *
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VM_REFCNT_LIMIT))) {
                /*
                 * oldcnt == 0 means the vma was detached from under us, so
                 * return ERR_PTR(-EAGAIN) and let the caller retry the
                 * lookup; any other failure means a writer is excluding
                 * readers, so fall back to taking the mmap_lock.
                 */
                vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
                goto err;
        }

        __vma_lockdep_acquire_read(vma);

        if (unlikely(vma->vm_mm != mm))
                goto err_unstable;

        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce a false locked
         * result. A false unlocked result is impossible because we modify and
         * check vma->vm_lock_seq under vma->vm_refcnt protection, and
         * mm->mm_lock_seq modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                vma_refcount_put(vma);
                vma = NULL;
                goto err;
        }

        return vma;
err:
        rcu_read_unlock();

        return vma;
err_unstable:
        /*
         * If vma got attached to another mm from under us, that mm is not
         * stable and can be freed in the narrow window after vma->vm_refcnt
         * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
         * releasing vma->vm_refcnt.
         */
        /* Use a copy as vma can be freed after we drop vm_refcnt. */
        other_mm = vma->vm_mm;
        mmgrab(other_mm);
        vma_refcount_put(vma);

        /* __mmdrop() is a heavy operation, do it after dropping the RCU lock. */
        rcu_read_unlock();
        mmdrop(other_mm);

        return NULL;
}

/*
 * Look up and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being modified,
 * the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

retry:
        rcu_read_lock();
        vma = mas_walk(&mas);
        if (!vma) {
                rcu_read_unlock();
                goto inval;
        }

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /* Check if the VMA got isolated after we found it */
                if (PTR_ERR(vma) == -EAGAIN) {
                        count_vm_vma_lock_event(VMA_LOCK_MISS);
                        /* The area was replaced with another one */
                        mas_set(&mas, address);
                        goto retry;
                }

                /* Failed to lock the VMA */
                goto inval;
        }
        /*
         * At this point, we have a stable reference to a VMA: The VMA is
         * locked and we know it hasn't already been isolated.
         * From here on, we can access the VMA without worrying about which
         * fields are accessible for RCU readers.
         */
        rcu_read_unlock();

        /* Check if the vma we locked is the right one. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                vma_end_read(vma);
                goto inval;
        }

        return vma;

inval:
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
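
/*
 * A minimal sketch of the typical consumer, modelled on the architecture
 * page fault handlers (the flags and helpers below are the ones those
 * callers use; error handling is elided):
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;	(fall back to mmap_read_lock() + find_vma())
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *
 * Note that the caller does not hold the RCU read lock: this function takes
 * and releases it internally and returns a VMA pinned by its refcount.
 */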

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
                                                            struct vma_iterator *vmi,
                                                            unsigned long from_addr)
{
        struct vm_area_struct *vma;
        int ret;

        ret = mmap_read_lock_killable(mm);
        if (ret)
                return ERR_PTR(ret);

        /* Look up the vma at the last position again under mmap_read_lock */
        vma_iter_set(vmi, from_addr);
        vma = vma_next(vmi);
        if (vma) {
                /* Very unlikely vma->vm_refcnt overflow case */
                if (unlikely(!vma_start_read_locked(vma)))
                        vma = ERR_PTR(-EAGAIN);
        }

        mmap_read_unlock(mm);

        return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *vmi,
                                     unsigned long from_addr)
{
        struct vm_area_struct *vma;
        unsigned int mm_wr_seq;
        bool mmap_unlocked;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
        /* Start mmap_lock speculation in case we need to verify the vma later */
        mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
        vma = vma_next(vmi);
        if (!vma)
                return NULL;

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /*
                 * Retry immediately if the vma gets detached from under us.
                 * Infinite loop should not happen because the vma we find will
                 * have to be constantly knocked out from under us.
                 */
                if (PTR_ERR(vma) == -EAGAIN) {
                        /* reset to search from the last address */
                        rcu_read_lock();
                        vma_iter_set(vmi, from_addr);
                        goto retry;
                }

                goto fallback;
        }

        /* Verify the vma is not behind the last search position. */
        if (unlikely(from_addr >= vma->vm_end))
                goto fallback_unlock;

        /*
         * vma can be ahead of the last search position but we need to verify
         * it was not shrunk after we found it and another vma has not been
         * installed ahead of it. Otherwise we might observe a gap that should
         * not be there.
         */
        if (from_addr < vma->vm_start) {
                /* Verify only if the address space might have changed since vma lookup. */
                if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
                        vma_iter_set(vmi, from_addr);
                        if (vma != vma_next(vmi))
                                goto fallback_unlock;
                }
        }

        return vma;

fallback_unlock:
        rcu_read_unlock();
        vma_end_read(vma);
fallback:
        vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
        rcu_read_lock();
        /* Reinitialize the iterator after re-entering rcu read section */
        vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

        return vma;
}
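
/*
 * A hedged sketch of the intended iteration pattern (modelled on the
 * /proc/<pid>/maps readers; locals and error handling are illustrative):
 *
 *	rcu_read_lock();
 *	vma_iter_init(&vmi, mm, addr);
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;	(e.g. -EINTR from the killable fallback)
 *		addr = vma->vm_end;
 *		(consume the read-locked vma here)
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */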
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (likely(mmap_read_trylock(mm)))
                return true;

        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }

        return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically an
         *    atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}
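
/*
 * For illustration only, the operation described above might look roughly
 * like the sketch below; RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED are
 * private to kernel/locking/rwsem.c and the lockdep/owner bookkeeping is
 * omitted, which is exactly why this cannot be implemented here today:
 *
 *	long cnt = RWSEM_READER_BIAS;	(we are the sole reader)
 *
 *	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &cnt,
 *					       RWSEM_WRITER_LOCKED);
 */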

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);
        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }
        return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to do all this only for instructions that are
 * actually expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
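
/*
 * Caller-side contract, sketched after the way the architecture fault
 * handlers use this helper (bad_area_nosemaphore() stands in for whatever
 * the caller does on failure): on success the mm is read-locked and the
 * caller must drop the lock, while on failure it has already been released:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (!vma)
 *		return bad_area_nosemaphore(regs, address);
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 */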
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                mmap_read_unlock(mm);
        return vma;
}

#endif /* CONFIG_MMU */