root/fs/xfs/xfs_extent_busy.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_rtgroup.h"

struct xfs_extent_busy_tree {
        spinlock_t              eb_lock;        /* protects tree and gen */
        struct rb_root          eb_tree;        /* busy extents, sorted by bno */
        unsigned int            eb_gen;         /* bumped when extents clear */
        wait_queue_head_t       eb_wait;        /* woken when eb_gen changes */
};

static void
xfs_extent_busy_insert_list(
        struct xfs_group        *xg,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags,
        struct list_head        *busy_list)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

        new = kzalloc(sizeof(struct xfs_extent_busy),
                        GFP_KERNEL | __GFP_NOFAIL);
        new->group = xfs_group_hold(xg);
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* trace before insert to be able to see failed inserts */
        trace_xfs_extent_busy(xg, bno, len);

        spin_lock(&eb->eb_lock);
        rbp = &eb->eb_tree.rb_node;
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &eb->eb_tree);

        /* always process discard lists in fifo order */
        list_add_tail(&new->list, busy_list);
        spin_unlock(&eb->eb_lock);
}

void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        struct xfs_group        *xg,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        xfs_extent_busy_insert_list(xg, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
        struct xfs_group        *xg,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        struct list_head        *busy_list)
{
        xfs_extent_busy_insert_list(xg, bno, len, XFS_EXTENT_BUSY_DISCARDED,
                        busy_list);
}
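
/*
 * Usage sketch (editor's illustration; the calling context shown is an
 * assumption, not code from this file): extents freed in a transaction are
 * marked busy at free time so they cannot be reallocated and overwritten
 * before the free has been committed to the log:
 *
 *	// in the transaction that frees [bno, bno + len)
 *	xfs_extent_busy_insert(tp, xg, bno, len, 0);
 */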

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken internally, so it must not
 * be held by the caller.  This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an
 * exact match.  Any non-zero return therefore indicates an overlap that
 * will require a synchronous transaction, while still letting the caller
 * distinguish between a partial and an exact match.
 */
int
xfs_extent_busy_search(
        struct xfs_group        *xg,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        /* find closest start bno overlap */
        spin_lock(&eb->eb_lock);
        rbp = eb->eb_tree.rb_node;
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&eb->eb_lock);
        return match;
}
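
/*
 * Example (editor's sketch; the handling shown is illustrative only): a
 * caller can act on the three return values like so:
 *
 *	switch (xfs_extent_busy_search(xg, bno, len)) {
 *	case 0:
 *		break;	// no busy extent overlaps [bno, bno + len)
 *	case -1:	// partial overlap
 *	case 1:		// exact match
 *		// a synchronous transaction is needed before reuse
 *		break;
 *	}
 */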

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_group        *xg,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
                __releases(&eb->eb_lock)
                __acquires(&eb->eb_lock)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&eb->eb_lock);
                delay(1);
                spin_lock(&eb->eb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &eb->eb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *              bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *             +---------+
                 *             fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
                busyp->bno = fend;
                busyp->length = bend - fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(xg, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&eb->eb_lock);
        xfs_log_force(xg->xg_mount, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(xg, fbno, flen);
        spin_lock(&eb->eb_lock);
        return false;
}

/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
        struct xfs_group        *xg,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        struct rb_node          *rbp;

        ASSERT(flen > 0);
        spin_lock(&eb->eb_lock);
restart:
        rbp = eb->eb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (!xfs_extent_busy_update_extent(xg, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&eb->eb_lock);
}
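
/*
 * Usage sketch (hedged; the calling context shown is hypothetical): an
 * allocator about to hand out a range that may still be busy from an
 * earlier free would call this before the blocks can be written:
 *
 *	// reusing [fbno, fbno + flen) for metadata (not user data)
 *	xfs_extent_busy_reuse(xg, fbno, flen, false);
 *	// on return the range no longer overlaps any busy extent
 */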

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than minlen on
 * return, no suitable extent could be found, and the caller needs to force
 * out the log and retry the allocation.
 *
 * Return the current busy generation for the group if the extent was
 * trimmed.  This value can be used to wait for at least one of the currently
 * busy extents to be cleared.  Note that the busy list is not guaranteed to
 * be empty after the gen is woken.  The state of a specific extent must
 * always be confirmed with another call to xfs_extent_busy_trim() before it
 * can be used.
 */
bool
xfs_extent_busy_trim(
        struct xfs_group        *xg,
        xfs_extlen_t            minlen,
        xfs_extlen_t            maxlen,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        unsigned                *busy_gen)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;
        bool                    ret = false;

        ASSERT(*len > 0);

        spin_lock(&eb->eb_lock);
        fbno = *bno;
        flen = *len;
        rbp = eb->eb_tree.rb_node;
        while (rbp && flen >= minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, therefore we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation;  if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
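                        /*
                         * Worked example (illustrative numbers): with
                         * fbno = 0, fend = 100, bbno = 40, bend = 60 and
                         * maxlen = 32, the left candidate is 40 blocks and
                         * 40 >= maxlen, so we trim to [0, 40).  With
                         * maxlen = 64 instead, neither 40 >= 64 nor
                         * 100 - 60 = 40 >= 64 * 4 holds, so the left
                         * candidate is only taken if it meets minlen.
                         */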
                        if (bbno - fbno >= maxlen) {
                                /* left candidate fits perfect */
                                fend = bbno;
                        } else if (fend - bend >= maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= minlen) {
                                /* left candidate fits minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
out:

        if (fbno != *bno || flen != *len) {
                trace_xfs_extent_busy_trim(xg, *bno, *len, fbno, flen);
                *bno = fbno;
                *len = flen;
                *busy_gen = eb->eb_gen;
                ret = true;
        }
        spin_unlock(&eb->eb_lock);
        return ret;
fail:
        /*
         * Return a zero extent length as the failure indication.  All
         * callers re-check whether the trimmed extent satisfies the minlen
         * requirement.
         */
        flen = 0;
        goto out;
}
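
/*
 * Usage sketch (hedged; the variable names are hypothetical): a typical
 * caller trims a candidate extent and, if too much of it turned out to be
 * busy, flushes and retries rather than allocating busy blocks:
 *
 *	unsigned busy_gen;
 *
 *	if (xfs_extent_busy_trim(xg, minlen, maxlen, &bno, &len, &busy_gen) &&
 *	    len < minlen) {
 *		error = xfs_extent_busy_flush(tp, xg, busy_gen, alloc_flags);
 *		if (error)
 *			return error;
 *		// retry the search from the top
 *	}
 */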

static bool
xfs_extent_busy_clear_one(
        struct xfs_extent_busy  *busyp,
        bool                    do_discard)
{
        struct xfs_extent_busy_tree *eb = busyp->group->xg_busy_extents;

        if (busyp->length) {
                if (do_discard &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                        return false;
                }
                trace_xfs_extent_busy_clear(busyp->group, busyp->bno,
                                busyp->length);
                rb_erase(&busyp->rb_node, &eb->eb_tree);
        }

        list_del_init(&busyp->list);
        xfs_group_put(busyp->group);
        kfree(busyp);
        return true;
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *next;

        busyp = list_first_entry_or_null(list, typeof(*busyp), list);
        if (!busyp)
                return;

        do {
                struct xfs_group        *xg = xfs_group_hold(busyp->group);
                struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
                bool                    wakeup = false;

                spin_lock(&eb->eb_lock);
                do {
                        next = list_next_entry(busyp, list);
                        if (xfs_extent_busy_clear_one(busyp, do_discard))
                                wakeup = true;
                        busyp = next;
                } while (!list_entry_is_head(busyp, list, list) &&
                         busyp->group == xg);

                if (wakeup) {
                        eb->eb_gen++;
                        wake_up_all(&eb->eb_wait);
                }
                spin_unlock(&eb->eb_lock);
                xfs_group_put(xg);
        } while (!list_entry_is_head(busyp, list, list));
}
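
/*
 * Usage sketch (hedged; busy_list here is a stand-in for the real list):
 * the busy list attached to a transaction or CIL context is cleared once
 * the log items covering the frees are on stable storage, optionally
 * turning the freed ranges into discard requests:
 *
 *	// after the checkpoint containing these frees has been written
 *	xfs_extent_busy_clear(&busy_list, xfs_has_discard(mp));
 */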

/*
 * Flush out all busy extents for this group.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve. If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free extents
 * in the transaction busy list and the log force won't resolve that situation.
 * In this case, we must return -EAGAIN to avoid a deadlock by informing the
 * caller it needs to commit the busy extents it holds before retrying the
 * extent free operation.
 */
int
xfs_extent_busy_flush(
        struct xfs_trans        *tp,
        struct xfs_group        *xg,
        unsigned                busy_gen,
        uint32_t                alloc_flags)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        DEFINE_WAIT             (wait);
        int                     error;

        error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
        if (error)
                return error;

        /* Avoid deadlocks on uncommitted busy extents. */
        if (!list_empty(&tp->t_busy)) {
                if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
                        return 0;

                if (busy_gen != READ_ONCE(eb->eb_gen))
                        return 0;

                if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
                        return -EAGAIN;
        }

        /* Wait for committed busy extents to resolve. */
        do {
                prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
                if (busy_gen != READ_ONCE(eb->eb_gen))
                        break;
                schedule();
        } while (1);

        finish_wait(&eb->eb_wait, &wait);
        return 0;
}
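
/*
 * Usage sketch (hedged): a caller freeing extents must be prepared for
 * -EAGAIN, which means its own transaction holds the busy extents that
 * block progress:
 *
 *	error = xfs_extent_busy_flush(tp, xg, busy_gen,
 *			alloc_flags | XFS_ALLOC_FLAG_FREEING);
 *	if (error == -EAGAIN) {
 *		// commit the transaction to resolve our own busy extents,
 *		// then retry the free
 *	}
 */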

static void
xfs_extent_busy_wait_group(
        struct xfs_group        *xg)
{
        DEFINE_WAIT             (wait);
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;

        do {
                prepare_to_wait(&eb->eb_wait, &wait, TASK_KILLABLE);
                if (RB_EMPTY_ROOT(&eb->eb_tree))
                        break;
                schedule();
        } while (1);
        finish_wait(&eb->eb_wait, &wait);
}

void
xfs_extent_busy_wait_all(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag = NULL;
        struct xfs_rtgroup      *rtg = NULL;

        while ((pag = xfs_perag_next(mp, pag)))
                xfs_extent_busy_wait_group(pag_group(pag));

        if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
                while ((rtg = xfs_rtgroup_next(mp, rtg)))
                        xfs_extent_busy_wait_group(rtg_group(rtg));
}

/*
 * Callback for list_sort to sort busy extents by the group they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        const struct list_head  *l1,
        const struct list_head  *l2)
{
        struct xfs_extent_busy  *b1 =
                container_of(l1, struct xfs_extent_busy, list);
        struct xfs_extent_busy  *b2 =
                container_of(l2, struct xfs_extent_busy, list);
        s32 diff;

        diff = b1->group->xg_gno - b2->group->xg_gno;
        if (!diff)
                diff = b1->bno - b2->bno;
        return diff;
}
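
/*
 * Usage sketch (hedged; busy_list is a stand-in name): this comparator is
 * intended for list_sort(), e.g. to order busy extents by group and block
 * number before walking them:
 *
 *	list_sort(NULL, &busy_list, xfs_extent_busy_ag_cmp);
 */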

/* Are there any busy extents in this group? */
bool
xfs_extent_busy_list_empty(
        struct xfs_group        *xg,
        unsigned                *busy_gen)
{
        struct xfs_extent_busy_tree *eb = xg->xg_busy_extents;
        bool                    res;

        spin_lock(&eb->eb_lock);
        res = RB_EMPTY_ROOT(&eb->eb_tree);
        *busy_gen = READ_ONCE(eb->eb_gen);
        spin_unlock(&eb->eb_lock);
        return res;
}

struct xfs_extent_busy_tree *
xfs_extent_busy_alloc(void)
{
        struct xfs_extent_busy_tree *eb;

        eb = kzalloc(sizeof(*eb), GFP_KERNEL);
        if (!eb)
                return NULL;
        spin_lock_init(&eb->eb_lock);
        init_waitqueue_head(&eb->eb_wait);
        eb->eb_tree = RB_ROOT;
        return eb;
}