root/fs/xfs/scrub/refcount_repair.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_inode.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"
#include "scrub/rcbag.h"

/*
 * Rebuilding the Reference Count Btree
 * ====================================
 *
 * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
 * entries as rectangles representing extents of physical blocks, and
 * that the rectangles can be laid down to allow them to overlap each
 * other; then we know that we must emit a refcnt btree entry wherever
 * the amount of overlap changes, i.e. the emission stimulus is
 * level-triggered:
 *
 *                 -    ---
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
 * 2 1  23 21    3 43 234  2123  1 01 2  3     0
 *
 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be enabled.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size
 *     - Set sp = np.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * Like all the other repairers, we make a list of all the refcount
 * records we need, then reinitialize the refcount btree root and
 * insert all the records.
 */

struct xrep_refc {
        /* refcount extents */
        struct xfarray          *refcount_records;

        /* new refcountbt information */
        struct xrep_newbt       new_btree;

        /* old refcountbt blocks */
        struct xagb_bitmap      old_refcountbt_blocks;

        struct xfs_scrub        *sc;

        /* get_records()'s position in the refcount record array. */
        xfarray_idx_t           array_cur;

        /* # of refcountbt blocks */
        xfs_extlen_t            btblocks;
};

/* Set us up to repair refcount btrees. */
int
xrep_setup_ag_refcountbt(
        struct xfs_scrub        *sc)
{
        return xrep_setup_xfbtree(sc, "rmap record bag");
}

/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
        struct xfs_scrub                *sc,
        const struct xfs_refcount_irec  *rec)
{
        enum xbtree_recpacking          outcome;
        int                             error;

        if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
                return -EFSCORRUPTED;

        /* Make sure this isn't free space. */
        error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
                        rec->rc_blockcount, &outcome);
        if (error)
                return error;
        if (outcome != XBTREE_RECPACKING_EMPTY)
                return -EFSCORRUPTED;

        /* Must not be an inode chunk. */
        error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
                        rec->rc_startblock, rec->rc_blockcount, &outcome);
        if (error)
                return error;
        if (outcome != XBTREE_RECPACKING_EMPTY)
                return -EFSCORRUPTED;

        return 0;
}

/* Record a reference count extent. */
STATIC int
xrep_refc_stash(
        struct xrep_refc                *rr,
        enum xfs_refc_domain            domain,
        xfs_agblock_t                   agbno,
        xfs_extlen_t                    len,
        uint64_t                        refcount)
{
        struct xfs_refcount_irec        irec = {
                .rc_startblock          = agbno,
                .rc_blockcount          = len,
                .rc_domain              = domain,
        };
        struct xfs_scrub                *sc = rr->sc;
        int                             error = 0;

        if (xchk_should_terminate(sc, &error))
                return error;

        irec.rc_refcount = min_t(uint64_t, XFS_REFC_REFCOUNT_MAX, refcount);

        error = xrep_refc_check_ext(rr->sc, &irec);
        if (error)
                return error;

        trace_xrep_refc_found(pag_group(sc->sa.pag), &irec);

        return xfarray_append(rr->refcount_records, &irec);
}

/* Record a CoW staging extent. */
STATIC int
xrep_refc_stash_cow(
        struct xrep_refc                *rr,
        xfs_agblock_t                   agbno,
        xfs_extlen_t                    len)
{
        return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
}

/* Decide if an rmap could describe a shared extent. */
static inline bool
xrep_refc_rmap_shareable(
        struct xfs_mount                *mp,
        const struct xfs_rmap_irec      *rmap)
{
        /* AG metadata are never sharable */
        if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
                return false;

        /* Metadata in files are never shareable */
        if (xfs_is_sb_inum(mp, rmap->rm_owner))
                return false;

        /* Metadata and unwritten file blocks are not shareable. */
        if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
                              XFS_RMAP_UNWRITTEN))
                return false;

        return true;
}

/*
 * Walk along the reverse mapping records until we find one that could describe
 * a shared extent.
 */
STATIC int
xrep_refc_walk_rmaps(
        struct xrep_refc        *rr,
        struct xfs_rmap_irec    *rmap,
        bool                    *have_rec)
{
        struct xfs_btree_cur    *cur = rr->sc->sa.rmap_cur;
        struct xfs_mount        *mp = cur->bc_mp;
        int                     have_gt;
        int                     error = 0;

        *have_rec = false;

        /*
         * Loop through the remaining rmaps.  Remember CoW staging
         * extents and the refcountbt blocks from the old tree for later
         * disposal.  We can only share written data fork extents, so
         * keep looping until we find an rmap for one.
         */
        do {
                if (xchk_should_terminate(rr->sc, &error))
                        return error;

                error = xfs_btree_increment(cur, 0, &have_gt);
                if (error)
                        return error;
                if (!have_gt)
                        return 0;

                error = xfs_rmap_get_rec(cur, rmap, &have_gt);
                if (error)
                        return error;
                if (XFS_IS_CORRUPT(mp, !have_gt)) {
                        xfs_btree_mark_sick(cur);
                        return -EFSCORRUPTED;
                }

                if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
                        error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
                                        rmap->rm_blockcount);
                        if (error)
                                return error;
                } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
                        /* refcountbt block, dump it when we're done. */
                        rr->btblocks += rmap->rm_blockcount;
                        error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
                                        rmap->rm_startblock,
                                        rmap->rm_blockcount);
                        if (error)
                                return error;
                }
        } while (!xrep_refc_rmap_shareable(mp, rmap));

        *have_rec = true;
        return 0;
}

static inline uint32_t
xrep_refc_encode_startblock(
        const struct xfs_refcount_irec  *irec)
{
        uint32_t                        start;

        start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
        if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
                start |= XFS_REFC_COWFLAG;

        return start;
}

/* Sort in the same order as the ondisk records. */
static int
xrep_refc_extent_cmp(
        const void                      *a,
        const void                      *b)
{
        const struct xfs_refcount_irec  *ap = a;
        const struct xfs_refcount_irec  *bp = b;
        uint32_t                        sa, sb;

        sa = xrep_refc_encode_startblock(ap);
        sb = xrep_refc_encode_startblock(bp);

        if (sa > sb)
                return 1;
        if (sa < sb)
                return -1;
        return 0;
}

/*
 * Sort the refcount extents by startblock or else the btree records will be in
 * the wrong order.  Make sure the records do not overlap in physical space.
 */
STATIC int
xrep_refc_sort_records(
        struct xrep_refc                *rr)
{
        struct xfs_refcount_irec        irec;
        xfarray_idx_t                   cur;
        enum xfs_refc_domain            dom = XFS_REFC_DOMAIN_SHARED;
        xfs_agblock_t                   next_agbno = 0;
        int                             error;

        error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
                        XFARRAY_SORT_KILLABLE);
        if (error)
                return error;

        foreach_xfarray_idx(rr->refcount_records, cur) {
                if (xchk_should_terminate(rr->sc, &error))
                        return error;

                error = xfarray_load(rr->refcount_records, cur, &irec);
                if (error)
                        return error;

                if (dom == XFS_REFC_DOMAIN_SHARED &&
                    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
                        dom = irec.rc_domain;
                        next_agbno = 0;
                }

                if (dom != irec.rc_domain)
                        return -EFSCORRUPTED;
                if (irec.rc_startblock < next_agbno)
                        return -EFSCORRUPTED;

                next_agbno = irec.rc_startblock + irec.rc_blockcount;
        }

        return error;
}

/*
 * Walk forward through the rmap btree to collect all rmaps starting at
 * @bno in @rmap_bag.  These represent the file(s) that share ownership of
 * the current block.  Upon return, the rmap cursor points to the last record
 * satisfying the startblock constraint.
 */
static int
xrep_refc_push_rmaps_at(
        struct xrep_refc        *rr,
        struct rcbag            *rcstack,
        xfs_agblock_t           bno,
        struct xfs_rmap_irec    *rmap,
        bool                    *have)
{
        struct xfs_scrub        *sc = rr->sc;
        int                     have_gt;
        int                     error;

        while (*have && rmap->rm_startblock == bno) {
                error = rcbag_add(rcstack, rr->sc->tp, rmap);
                if (error)
                        return error;

                error = xrep_refc_walk_rmaps(rr, rmap, have);
                if (error)
                        return error;
        }

        error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
        if (error)
                return error;
        if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
                xfs_btree_mark_sick(sc->sa.rmap_cur);
                return -EFSCORRUPTED;
        }

        return 0;
}

/* Iterate all the rmap records to generate reference count data. */
STATIC int
xrep_refc_find_refcounts(
        struct xrep_refc        *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct rcbag            *rcstack;
        uint64_t                old_stack_height;
        xfs_agblock_t           sbno;
        xfs_agblock_t           cbno;
        xfs_agblock_t           nbno;
        bool                    have;
        int                     error;

        xrep_ag_btcur_init(sc, &sc->sa);

        /*
         * Set up a bag to store all the rmap records that we're tracking to
         * generate a reference count record.  If the size of the bag exceeds
         * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
         */
        error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
        if (error)
                goto out_cur;

        /* Start the rmapbt cursor to the left of all records. */
        error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
        if (error)
                goto out_bag;

        /* Process reverse mappings into refcount data. */
        while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
                struct xfs_rmap_irec    rmap;

                /* Push all rmaps with pblk == sbno onto the stack */
                error = xrep_refc_walk_rmaps(rr, &rmap, &have);
                if (error)
                        goto out_bag;
                if (!have)
                        break;
                sbno = cbno = rmap.rm_startblock;
                error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
                                &have);
                if (error)
                        goto out_bag;

                /* Set nbno to the bno of the next refcount change */
                error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
                if (error)
                        goto out_bag;

                ASSERT(nbno > sbno);
                old_stack_height = rcbag_count(rcstack);

                /* While stack isn't empty... */
                while (rcbag_count(rcstack) > 0) {
                        /* Pop all rmaps that end at nbno */
                        error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
                        if (error)
                                goto out_bag;

                        /* Push array items that start at nbno */
                        error = xrep_refc_walk_rmaps(rr, &rmap, &have);
                        if (error)
                                goto out_bag;
                        if (have) {
                                error = xrep_refc_push_rmaps_at(rr, rcstack,
                                                nbno, &rmap, &have);
                                if (error)
                                        goto out_bag;
                        }

                        /* Emit refcount if necessary */
                        ASSERT(nbno > cbno);
                        if (rcbag_count(rcstack) != old_stack_height) {
                                if (old_stack_height > 1) {
                                        error = xrep_refc_stash(rr,
                                                        XFS_REFC_DOMAIN_SHARED,
                                                        cbno, nbno - cbno,
                                                        old_stack_height);
                                        if (error)
                                                goto out_bag;
                                }
                                cbno = nbno;
                        }

                        /* Stack empty, go find the next rmap */
                        if (rcbag_count(rcstack) == 0)
                                break;
                        old_stack_height = rcbag_count(rcstack);
                        sbno = nbno;

                        /* Set nbno to the bno of the next refcount change */
                        error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
                                        &nbno);
                        if (error)
                                goto out_bag;

                        ASSERT(nbno > sbno);
                }
        }

        ASSERT(rcbag_count(rcstack) == 0);
out_bag:
        rcbag_free(&rcstack);
out_cur:
        xchk_ag_btcur_free(&sc->sa);
        return error;
}

/* Retrieve refcountbt data for bulk load. */
STATIC int
xrep_refc_get_records(
        struct xfs_btree_cur            *cur,
        unsigned int                    idx,
        struct xfs_btree_block          *block,
        unsigned int                    nr_wanted,
        void                            *priv)
{
        struct xfs_refcount_irec        *irec = &cur->bc_rec.rc;
        struct xrep_refc                *rr = priv;
        union xfs_btree_rec             *block_rec;
        unsigned int                    loaded;
        int                             error;

        for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
                error = xfarray_load(rr->refcount_records, rr->array_cur++,
                                irec);
                if (error)
                        return error;

                block_rec = xfs_btree_rec_addr(cur, idx, block);
                cur->bc_ops->init_rec_from_cur(cur, block_rec);
        }

        return loaded;
}

/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_refc_claim_block(
        struct xfs_btree_cur    *cur,
        union xfs_btree_ptr     *ptr,
        void                    *priv)
{
        struct xrep_refc        *rr = priv;

        return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Update the AGF counters. */
STATIC int
xrep_refc_reset_counters(
        struct xrep_refc        *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_perag        *pag = sc->sa.pag;

        /*
         * After we commit the new btree to disk, it is possible that the
         * process to reap the old btree blocks will race with the AIL trying
         * to checkpoint the old btree blocks into the filesystem.  If the new
         * tree is shorter than the old one, the refcountbt write verifier will
         * fail and the AIL will shut down the filesystem.
         *
         * To avoid this, save the old incore btree height values as the alt
         * height values before re-initializing the perag info from the updated
         * AGF to capture all the new values.
         */
        pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

        /* Reinitialize with the values we just logged. */
        return xrep_reinit_pagf(sc);
}

/*
 * Use the collected refcount information to stage a new refcount btree.  If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 */
STATIC int
xrep_refc_build_new_tree(
        struct xrep_refc        *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_btree_cur    *refc_cur;
        struct xfs_perag        *pag = sc->sa.pag;
        int                     error;

        error = xrep_refc_sort_records(rr);
        if (error)
                return error;

        /*
         * Prepare to construct the new btree by reserving disk space for the
         * new btree and setting up all the accounting information we'll need
         * to root the new btree while it's under construction and before we
         * attach it to the AG header.
         */
        xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
                        xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
                        XFS_AG_RESV_METADATA);
        rr->new_btree.bload.get_records = xrep_refc_get_records;
        rr->new_btree.bload.claim_block = xrep_refc_claim_block;

        /* Compute how many blocks we'll need. */
        refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
        xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
        error = xfs_btree_bload_compute_geometry(refc_cur,
                        &rr->new_btree.bload,
                        xfarray_length(rr->refcount_records));
        if (error)
                goto err_cur;

        /* Last chance to abort before we start committing fixes. */
        if (xchk_should_terminate(sc, &error))
                goto err_cur;

        /* Reserve the space we'll need for the new btree. */
        error = xrep_newbt_alloc_blocks(&rr->new_btree,
                        rr->new_btree.bload.nr_blocks);
        if (error)
                goto err_cur;

        /*
         * Due to btree slack factors, it's possible for a new btree to be one
         * level taller than the old btree.  Update the incore btree height so
         * that we don't trip the verifiers when writing the new btree blocks
         * to disk.
         */
        pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;

        /* Add all observed refcount records. */
        rr->array_cur = XFARRAY_CURSOR_INIT;
        error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
        if (error)
                goto err_level;

        /*
         * Install the new btree in the AG header.  After this point the old
         * btree is no longer accessible and the new tree is live.
         */
        xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
        xfs_btree_del_cursor(refc_cur, 0);

        /* Reset the AGF counters now that we've changed the btree shape. */
        error = xrep_refc_reset_counters(rr);
        if (error)
                goto err_newbt;

        /* Dispose of any unused blocks and the accounting information. */
        error = xrep_newbt_commit(&rr->new_btree);
        if (error)
                return error;

        return xrep_roll_ag_trans(sc);

err_level:
        pag->pagf_repair_refcount_level = 0;
err_cur:
        xfs_btree_del_cursor(refc_cur, error);
err_newbt:
        xrep_newbt_cancel(&rr->new_btree);
        return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_refc_remove_old_tree(
        struct xrep_refc        *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_perag        *pag = sc->sa.pag;
        int                     error;

        /* Free the old refcountbt blocks if they're not in use. */
        error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
                        &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
        if (error)
                return error;

        /*
         * Now that we've zapped all the old refcountbt blocks we can turn off
         * the alternate height mechanism and reset the per-AG space
         * reservations.
         */
        pag->pagf_repair_refcount_level = 0;
        sc->flags |= XREP_RESET_PERAG_RESV;
        return 0;
}

/* Rebuild the refcount btree. */
int
xrep_refcountbt(
        struct xfs_scrub        *sc)
{
        struct xrep_refc        *rr;
        struct xfs_mount        *mp = sc->mp;
        int                     error;

        /* We require the rmapbt to rebuild anything. */
        if (!xfs_has_rmapbt(mp))
                return -EOPNOTSUPP;

        rr = kzalloc_obj(struct xrep_refc, XCHK_GFP_FLAGS);
        if (!rr)
                return -ENOMEM;
        rr->sc = sc;

        /* Set up enough storage to handle one refcount record per block. */
        error = xfarray_create("reference count records", mp->m_sb.sb_agblocks,
                        sizeof(struct xfs_refcount_irec),
                        &rr->refcount_records);
        if (error)
                goto out_rr;

        /* Collect all reference counts. */
        xagb_bitmap_init(&rr->old_refcountbt_blocks);
        error = xrep_refc_find_refcounts(rr);
        if (error)
                goto out_bitmap;

        /* Rebuild the refcount information. */
        error = xrep_refc_build_new_tree(rr);
        if (error)
                goto out_bitmap;

        /* Kill the old tree. */
        error = xrep_refc_remove_old_tree(rr);
        if (error)
                goto out_bitmap;

out_bitmap:
        xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
        xfarray_destroy(rr->refcount_records);
out_rr:
        kfree(rr);
        return error;
}