root/fs/xfs/scrub/rtrmap_repair.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_buf_mem.h"
#include "xfs_btree_mem.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_quota.h"
#include "xfs_rtalloc.h"
#include "xfs_ag.h"
#include "xfs_rtgroup.h"
#include "xfs_refcount.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/rgb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/iscan.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Realtime Reverse Mapping Btree Repair
 * =====================================
 *
 * This isn't quite as difficult as repairing the rmap btree on the data
 * device, since we only store the data fork extents of realtime files on the
 * realtime device.  We still have to freeze the filesystem and stop the
 * background threads like we do for the rmap repair, but we only have to scan
 * realtime inodes.
 *
 * Collecting entries for the new realtime rmap btree is easy -- all we have
 * to do is generate rtrmap entries from the data fork mappings of all realtime
 * files in the filesystem.  We then scan the rmap btrees of the data device
 * looking for extents belonging to the old btree and note them in a bitmap.
 *
 * To rebuild the realtime rmap btree, we bulk-load the collected mappings into
 * a new btree cursor and atomically swap that into the realtime inode.  Then
 * we can free the blocks from the old btree.
 *
 * We use the 'xrep_rtrmap' prefix for all the rmap functions.
 */

/*
 * Context for collecting rmaps.  xrep_setup_rtrmapbt() allocates one of these
 * (zeroed) and attaches it to sc->buf; xrep_rtrmapbt() picks it up from there.
 */
struct xrep_rtrmap {
        /* new rtrmapbt information */
        struct xrep_newbt       new_btree;

        /* lock for the xfbtree and xfile */
        struct mutex            lock;

        /* rmap records generated from primary metadata */
        struct xfbtree          rtrmap_btree;

        /* scrub context that this repair belongs to */
        struct xfs_scrub        *sc;

        /* bitmap of old rtrmapbt blocks */
        struct xfsb_bitmap      old_rtrmapbt_blocks;

        /* Hooks into rtrmap update code. */
        struct xfs_rmap_hook    rhook;

        /* inode scan cursor */
        struct xchk_iscan       iscan;

        /*
         * in-memory btree cursor for the ->get_records walk; created by
         * xrep_rtrmap_build_new_tree() and consumed by
         * xrep_rtrmap_get_records()
         */
        struct xfs_btree_cur    *mcur;

        /*
         * Number of records we're staging in the new btree.  Counted by
         * xrep_rtrmap_check_record() once the scan is complete.
         */
        uint64_t                nr_records;
};

/* Set us up to repair rt reverse mapping btrees. */
int
xrep_setup_rtrmapbt(
        struct xfs_scrub        *sc)
{
        struct xrep_rtrmap      *rr;
        int                     error;

        xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);

        error = xrep_setup_xfbtree(sc, "realtime reverse mapping records");
        if (error)
                return error;

        rr = kzalloc_obj(struct xrep_rtrmap, XCHK_GFP_FLAGS);
        if (!rr)
                return -ENOMEM;

        rr->sc = sc;
        sc->buf = rr;
        return 0;
}

/*
 * Sanity-check one collected mapping before it goes into the new btree.
 * Returns -EFSCORRUPTED if the record checker objects to the record, or
 * whatever xrep_require_rtext_inuse() says about the mapped space.
 */
STATIC int
xrep_rtrmap_check_mapping(
        struct xfs_scrub        *sc,
        const struct xfs_rmap_irec *rec)
{
        struct xfs_rtgroup      *rtg = sc->sr.rtg;

        /* Any non-NULL return from the record checker means a bad record. */
        if (xfs_rtrmap_check_irec(rtg, rec))
                return -EFSCORRUPTED;

        /* The space behind this mapping must not be free space. */
        return xrep_require_rtext_inuse(sc, rec->rm_startblock,
                        rec->rm_blockcount);
}

/*
 * Store a reverse-mapping record in the in-memory rtrmap btree.
 *
 * @startblock/@blockcount/@owner/@offset/@flags are copied verbatim into a
 * struct xfs_rmap_irec and added to rr->rtrmap_btree under rr->lock.
 *
 * Returns 0 on success or a negative errno.  We bail out before touching the
 * btree if the scrub should terminate, or with -EFSCORRUPTED if the inode scan
 * has already been aborted.  If adding or committing the record fails, the
 * inode scan is aborted before the error is returned.
 */
static inline int
xrep_rtrmap_stash(
        struct xrep_rtrmap      *rr,
        xfs_rgblock_t           startblock,
        xfs_extlen_t            blockcount,
        uint64_t                owner,
        uint64_t                offset,
        unsigned int            flags)
{
        struct xfs_rmap_irec    rmap = {
                .rm_startblock  = startblock,
                .rm_blockcount  = blockcount,
                .rm_owner       = owner,
                .rm_offset      = offset,
                .rm_flags       = flags,
        };
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_btree_cur    *mcur;
        int                     error = 0;

        if (xchk_should_terminate(sc, &error))
                return error;

        /* Once the scan is aborted there is no point in collecting more. */
        if (xchk_iscan_aborted(&rr->iscan))
                return -EFSCORRUPTED;

        trace_xrep_rtrmap_found(sc->mp, &rmap);

        /*
         * Add entry to in-memory btree.  xrep_rtrmapbt_live_update() takes
         * the same lock before it modifies the same btree.
         */
        mutex_lock(&rr->lock);
        mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, sc->tp, &rr->rtrmap_btree);
        error = xfs_rmap_map_raw(mcur, &rmap);
        xfs_btree_del_cursor(mcur, error);
        if (error)
                goto out_cancel;

        error = xfbtree_trans_commit(&rr->rtrmap_btree, sc->tp);
        if (error)
                goto out_abort;

        mutex_unlock(&rr->lock);
        return 0;

out_cancel:
        /* The insert failed, so cancel the pending xfbtree update. */
        xfbtree_trans_cancel(&rr->rtrmap_btree, sc->tp);
out_abort:
        /* Mark the scan aborted so later stash attempts fail early. */
        xchk_iscan_abort(&rr->iscan);
        mutex_unlock(&rr->lock);
        return error;
}

/* Finding all file and bmbt extents. */

/*
 * Context for accumulating rmaps for an inode fork.  Passed as the private
 * pointer to xrep_rtrmap_visit_bmbt().
 */
struct xrep_rtrmap_ifork {
        /*
         * Accumulate rmap data here to turn multiple adjacent bmaps into a
         * single rmap.  An rm_blockcount of zero means nothing has been
         * accumulated yet.
         */
        struct xfs_rmap_irec    accum;

        /* repair context that receives the accumulated rmaps */
        struct xrep_rtrmap      *rr;
};

/* Stash an rmap that we accumulated while walking an inode fork. */
STATIC int
xrep_rtrmap_stash_accumulated(
        struct xrep_rtrmap_ifork        *rf)
{
        if (rf->accum.rm_blockcount == 0)
                return 0;

        return xrep_rtrmap_stash(rf->rr, rf->accum.rm_startblock,
                        rf->accum.rm_blockcount, rf->accum.rm_owner,
                        rf->accum.rm_offset, rf->accum.rm_flags);
}

/* Accumulate a bmbt record. */
STATIC int
xrep_rtrmap_visit_bmbt(
        struct xfs_btree_cur    *cur,
        struct xfs_bmbt_irec    *rec,
        void                    *priv)
{
        struct xrep_rtrmap_ifork *rf = priv;
        struct xfs_rmap_irec    *accum = &rf->accum;
        struct xfs_mount        *mp = rf->rr->sc->mp;
        xfs_rgblock_t           rgbno;
        unsigned int            rmap_flags = 0;
        int                     error;

        if (xfs_rtb_to_rgno(mp, rec->br_startblock) !=
            rtg_rgno(rf->rr->sc->sr.rtg))
                return 0;

        if (rec->br_state == XFS_EXT_UNWRITTEN)
                rmap_flags |= XFS_RMAP_UNWRITTEN;

        /* If this bmap is adjacent to the previous one, just add it. */
        rgbno = xfs_rtb_to_rgbno(mp, rec->br_startblock);
        if (accum->rm_blockcount > 0 &&
            rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
            rgbno == accum->rm_startblock + accum->rm_blockcount &&
            rmap_flags == accum->rm_flags) {
                accum->rm_blockcount += rec->br_blockcount;
                return 0;
        }

        /* Otherwise stash the old rmap and start accumulating a new one. */
        error = xrep_rtrmap_stash_accumulated(rf);
        if (error)
                return error;

        accum->rm_startblock = rgbno;
        accum->rm_blockcount = rec->br_blockcount;
        accum->rm_offset = rec->br_startoff;
        accum->rm_flags = rmap_flags;
        return 0;
}

/*
 * Walk the ondisk block mapping btree of the data fork and collect rmap
 * records for everything in it that maps to our rtgroup.
 *
 * @mappings_done is set to true only if the btree walk ran to completion;
 * it is false if we skipped the walk because the incore extent cache is
 * already loaded, or if the walk failed.
 */
STATIC int
xrep_rtrmap_scan_bmbt(
        struct xrep_rtrmap_ifork *rf,
        struct xfs_inode        *ip,
        bool                    *mappings_done)
{
        struct xfs_scrub        *sc = rf->rr->sc;
        struct xfs_btree_cur    *bmcur;
        int                     error;

        *mappings_done = false;

        /*
         * When the extents are already cached incore, leave the work to the
         * incore extent scanner and skip the ondisk tree entirely.
         */
        if (!xfs_need_iread_extents(xfs_ifork_ptr(ip, XFS_DATA_FORK)))
                return 0;

        /* Feed every bmbt record through the accumulator. */
        bmcur = xfs_bmbt_init_cursor(sc->mp, sc->tp, ip, XFS_DATA_FORK);
        error = xfs_bmap_query_all(bmcur, xrep_rtrmap_visit_bmbt, rf);
        xfs_btree_del_cursor(bmcur, error);
        if (error)
                return error;

        /* Walk complete; flush the tail end of the accumulator. */
        *mappings_done = true;
        return xrep_rtrmap_stash_accumulated(rf);
}

/*
 * Iterate the in-core extent cache to collect rmap records for anything in
 * this fork that matches the AG.
 */
STATIC int
xrep_rtrmap_scan_iext(
        struct xrep_rtrmap_ifork *rf,
        struct xfs_ifork        *ifp)
{
        struct xfs_bmbt_irec    rec;
        struct xfs_iext_cursor  icur;
        int                     error;

        for_each_xfs_iext(ifp, &icur, &rec) {
                if (isnullstartblock(rec.br_startblock))
                        continue;
                error = xrep_rtrmap_visit_bmbt(NULL, &rec, rf);
                if (error)
                        return error;
        }

        return xrep_rtrmap_stash_accumulated(rf);
}

/*
 * Collect rmaps for every realtime extent mapped by the data fork of @ip.
 * Only extents-format and btree-format forks are acceptable; any other fork
 * format yields -EFSCORRUPTED.
 */
STATIC int
xrep_rtrmap_scan_dfork(
        struct xrep_rtrmap      *rr,
        struct xfs_inode        *ip)
{
        struct xrep_rtrmap_ifork rf = {
                .accum          = { .rm_owner = ip->i_ino, },
                .rr             = rr,
        };
        struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
        bool                    mappings_done;
        int                     error;

        switch (ifp->if_format) {
        case XFS_DINODE_FMT_BTREE:
                /*
                 * Try the ondisk bmbt first.  That scan declines to run if
                 * the incore extent tree is loaded, in which case we fall
                 * through to the cached mappings since that's faster when
                 * the extent counts are very high.
                 */
                error = xrep_rtrmap_scan_bmbt(&rf, ip, &mappings_done);
                if (error || mappings_done)
                        return error;
                break;
        case XFS_DINODE_FMT_EXTENTS:
                break;
        default:
                /* realtime data forks should only be extents or btree */
                return -EFSCORRUPTED;
        }

        /* Scan incore extent cache. */
        return xrep_rtrmap_scan_iext(&rf, ifp);
}

/*
 * Collect reverse mappings for one file found by the inode scan.  The inode
 * is marked visited only if its data fork was scanned without error (or did
 * not need scanning because the file is not a realtime file).
 */
STATIC int
xrep_rtrmap_scan_inode(
        struct xrep_rtrmap      *rr,
        struct xfs_inode        *ip)
{
        unsigned int            lock_mode;
        int                     error = 0;

        /* The inode under repair (the rt rmap btree inode) is not scanned. */
        if (ip == rr->sc->ip)
                return 0;

        lock_mode = xfs_ilock_data_map_shared(ip);

        /* Only realtime files have data fork mappings on the rt device. */
        if (XFS_IS_REALTIME_INODE(ip))
                error = xrep_rtrmap_scan_dfork(rr, ip);

        if (!error)
                xchk_iscan_mark_visited(&rr->iscan, ip);

        xfs_iunlock(ip, lock_mode);
        return error;
}

/* Record extents that belong to the realtime rmap inode. */
STATIC int
xrep_rtrmap_walk_rmap(
        struct xfs_btree_cur            *cur,
        const struct xfs_rmap_irec      *rec,
        void                            *priv)
{
        struct xrep_rtrmap              *rr = priv;
        int                             error = 0;

        if (xchk_should_terminate(rr->sc, &error))
                return error;

        /* Skip extents which are not owned by this inode and fork. */
        if (rec->rm_owner != rr->sc->ip->i_ino)
                return 0;

        error = xrep_check_ino_btree_mapping(rr->sc, rec);
        if (error)
                return error;

        return xfsb_bitmap_set(&rr->old_rtrmapbt_blocks,
                        xfs_gbno_to_fsb(cur->bc_group, rec->rm_startblock),
                        rec->rm_blockcount);
}

/* Scan one AG for reverse mappings for the realtime rmap btree. */
STATIC int
xrep_rtrmap_scan_ag(
        struct xrep_rtrmap      *rr,
        struct xfs_perag        *pag)
{
        struct xfs_scrub        *sc = rr->sc;
        int                     error;

        error = xrep_ag_init(sc, pag, &sc->sa);
        if (error)
                return error;

        error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_rtrmap_walk_rmap, rr);
        xchk_ag_free(sc, &sc->sa);
        return error;
}

/* Context for turning runs of set bitmap bits into rmap records. */
struct xrep_rtrmap_stash_run {
        /* repair context that receives the records */
        struct xrep_rtrmap      *rr;

        /* owner recorded in every rmap generated from the bitmap */
        uint64_t                owner;
};

/*
 * Bitmap walk callback: stash one rmap covering the run [@start, @start+@len)
 * for the owner carried in the walk context.  Offset and flags are zero.
 */
static int
xrep_rtrmap_stash_run(
        uint32_t                        start,
        uint32_t                        len,
        void                            *priv)
{
        struct xrep_rtrmap_stash_run    *run = priv;

        return xrep_rtrmap_stash(run->rr, start, len, run->owner, 0, 0);
}

/*
 * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
 * that the ranges are in units of FS blocks.
 */
STATIC int
xrep_rtrmap_stash_bitmap(
        struct xrep_rtrmap              *rr,
        struct xrgb_bitmap              *bitmap,
        const struct xfs_owner_info     *oinfo)
{
        struct xrep_rtrmap_stash_run    rsr = {
                .rr                     = rr,
                .owner                  = oinfo->oi_owner,
        };

        return xrgb_bitmap_walk(bitmap, xrep_rtrmap_stash_run, &rsr);
}

/*
 * Refcount walk callback: remember one CoW staging extent in the bitmap
 * passed via @priv.  Records that fail the domain check or that are not in
 * the CoW domain are treated as corruption.
 */
STATIC int
xrep_rtrmap_walk_cowblocks(
        struct xfs_btree_cur            *cur,
        const struct xfs_refcount_irec  *irec,
        void                            *priv)
{
        struct xrgb_bitmap              *cow_blocks = priv;

        if (xfs_refcount_check_domain(irec) &&
            irec->rc_domain == XFS_REFC_DOMAIN_COW)
                return xrgb_bitmap_set(cow_blocks, irec->rc_startblock,
                                irec->rc_blockcount);

        return -EFSCORRUPTED;
}

/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rtrmap_find_refcount_rmaps(
        struct xrep_rtrmap      *rr)
{
        struct xrgb_bitmap      cow_blocks;             /* COWBIT */
        struct xfs_refcount_irec low = {
                .rc_startblock  = 0,
                .rc_domain      = XFS_REFC_DOMAIN_COW,
        };
        struct xfs_refcount_irec high = {
                .rc_startblock  = -1U,
                .rc_domain      = XFS_REFC_DOMAIN_COW,
        };
        struct xfs_scrub        *sc = rr->sc;
        int                     error;

        if (!xfs_has_rtreflink(sc->mp))
                return 0;

        xrgb_bitmap_init(&cow_blocks);

        /* Collect rmaps for CoW staging extents. */
        error = xfs_refcount_query_range(sc->sr.refc_cur, &low, &high,
                        xrep_rtrmap_walk_cowblocks, &cow_blocks);
        if (error)
                goto out_bitmap;

        /* Generate rmaps for everything. */
        error = xrep_rtrmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
        if (error)
                goto out_bitmap;

out_bitmap:
        xrgb_bitmap_destroy(&cow_blocks);
        return error;
}

/*
 * In-memory btree walk callback: validate one collected record and, if it
 * passes, count it towards the number of records to be bulk loaded.
 */
STATIC int
xrep_rtrmap_check_record(
        struct xfs_btree_cur            *cur,
        const struct xfs_rmap_irec      *rec,
        void                            *priv)
{
        struct xrep_rtrmap              *rr = priv;
        int                             error;

        error = xrep_rtrmap_check_mapping(rr->sc, rec);
        if (!error)
                rr->nr_records++;

        return error;
}

/*
 * Generate all the reverse-mappings for the realtime device.
 *
 * In order, this stashes an rmap for the realtime superblock (rtgroup 0 only),
 * stashes rmaps for CoW staging extents, scans every inode in the filesystem
 * for realtime file mappings, scans the data device rmap btrees for blocks
 * owned by the old rtrmap btree, and finally checks and counts the collected
 * records into rr->nr_records.
 *
 * The transaction and rtgroup lock are dropped for the inode scan and
 * reacquired afterwards.  Returns 0 or a negative errno.
 */
STATIC int
xrep_rtrmap_find_rmaps(
        struct xrep_rtrmap      *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_perag        *pag = NULL;
        struct xfs_inode        *ip;
        struct xfs_btree_cur    *mcur;
        int                     error;

        /* Generate rmaps for the realtime superblock */
        if (xfs_has_rtsb(sc->mp) && rtg_rgno(rr->sc->sr.rtg) == 0) {
                error = xrep_rtrmap_stash(rr, 0, sc->mp->m_sb.sb_rextsize,
                                XFS_RMAP_OWN_FS, 0, 0);
                if (error)
                        return error;
        }

        /* Find CoW staging extents. */
        xrep_rtgroup_btcur_init(sc, &sc->sr);
        error = xrep_rtrmap_find_refcount_rmaps(rr);
        xchk_rtgroup_btcur_free(&sc->sr);
        if (error)
                return error;

        /*
         * Set up for a potentially lengthy filesystem scan by reducing our
         * transaction resource usage for the duration.  Specifically:
         *
         * Unlock the realtime metadata inodes and cancel the transaction to
         * release the log grant space while we scan the filesystem.
         *
         * Create a new empty transaction to eliminate the possibility of the
         * inode scan deadlocking on cyclical metadata.
         *
         * We pass the empty transaction to the file scanning function to avoid
         * repeatedly cycling empty transactions.  This can be done even though
         * we take the IOLOCK to quiesce the file because empty transactions
         * do not take sb_internal.
         */
        xchk_trans_cancel(sc);
        xchk_rtgroup_unlock(&sc->sr);
        xchk_trans_alloc_empty(sc);

        /*
         * The iterator returns 1 for each inode it hands us; any other value
         * ends the loop and is checked once the iteration has been finished.
         */
        while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
                error = xrep_rtrmap_scan_inode(rr, ip);
                xchk_irele(sc, ip);
                if (error)
                        break;

                if (xchk_should_terminate(sc, &error))
                        break;
        }
        xchk_iscan_iter_finish(&rr->iscan);
        if (error)
                return error;

        /*
         * Switch out for a real transaction and lock the RT metadata in
         * preparation for building a new tree.
         */
        xchk_trans_cancel(sc);
        error = xchk_setup_rt(sc);
        if (error)
                return error;
        error = xchk_rtgroup_lock(sc, &sc->sr, XCHK_RTGLOCK_ALL);
        if (error)
                return error;

        /*
         * If a hook failed to update the in-memory btree, we lack the data to
         * continue the repair.
         */
        if (xchk_iscan_aborted(&rr->iscan))
                return -EFSCORRUPTED;

        /* Scan for old rtrmap blocks. */
        while ((pag = xfs_perag_next(sc->mp, pag))) {
                error = xrep_rtrmap_scan_ag(rr, pag);
                if (error) {
                        /* Leaving the iteration early, so release @pag. */
                        xfs_perag_rele(pag);
                        return error;
                }
        }

        /*
         * Now that we have everything locked again, we need to count the
         * number of rmap records stashed in the btree.  This should reflect
         * all actively-owned rt files in the filesystem.  At the same time,
         * check all our records before we start building a new btree, which
         * requires the rtbitmap lock.
         */
        mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, NULL, &rr->rtrmap_btree);
        rr->nr_records = 0;
        error = xfs_rmap_query_all(mcur, xrep_rtrmap_check_record, rr);
        xfs_btree_del_cursor(mcur, error);

        return error;
}

/* Building the new rtrmap btree. */

/* Retrieve rtrmapbt data for bulk load. */
STATIC int
xrep_rtrmap_get_records(
        struct xfs_btree_cur            *cur,
        unsigned int                    idx,
        struct xfs_btree_block          *block,
        unsigned int                    nr_wanted,
        void                            *priv)
{
        struct xrep_rtrmap              *rr = priv;
        union xfs_btree_rec             *block_rec;
        unsigned int                    loaded;
        int                             error;

        for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
                int                     stat = 0;

                error = xfs_btree_increment(rr->mcur, 0, &stat);
                if (error)
                        return error;
                if (!stat)
                        return -EFSCORRUPTED;

                error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
                if (error)
                        return error;
                if (!stat)
                        return -EFSCORRUPTED;

                block_rec = xfs_btree_rec_addr(cur, idx, block);
                cur->bc_ops->init_rec_from_cur(cur, block_rec);
        }

        return loaded;
}

/*
 * Feed one of the new btree blocks to the bulk loader.  This is the bulk
 * loader's ->claim_block callback; @priv is the struct xrep_rtrmap and the
 * real work is delegated to xrep_newbt_claim_block() on rr->new_btree.
 */
STATIC int
xrep_rtrmap_claim_block(
        struct xfs_btree_cur    *cur,
        union xfs_btree_ptr     *ptr,
        void                    *priv)
{
        struct xrep_rtrmap      *rr = priv;

        return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/*
 * Figure out how much space we need to create the incore btree root block.
 * This is the bulk loader's ->iroot_size callback; the answer comes from
 * xfs_rtrmap_broot_space_calc() for the given @level and @nr_this_level.
 * @priv is unused.
 */
STATIC size_t
xrep_rtrmap_iroot_size(
        struct xfs_btree_cur    *cur,
        unsigned int            level,
        unsigned int            nr_this_level,
        void                    *priv)
{
        return xfs_rtrmap_broot_space_calc(cur->bc_mp, level, nr_this_level);
}

/*
 * Use the collected rmap information to stage a new rmap btree.  If this is
 * successful, the new btree has been installed in the realtime rmap inode,
 * the new btree accounting has been committed, and the repair transaction has
 * been rolled.
 *
 * On failure, any cursors created here are deleted and the new btree
 * reservation is cancelled before the error is returned.
 */
STATIC int
xrep_rtrmap_build_new_tree(
        struct xrep_rtrmap      *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        struct xfs_rtgroup      *rtg = sc->sr.rtg;
        struct xfs_btree_cur    *rmap_cur;
        int                     error;

        /*
         * Prepare to construct the new btree by reserving disk space for the
         * new btree and setting up all the accounting information we'll need
         * to root the new btree while it's under construction and before we
         * attach it to the realtime rmapbt inode.
         */
        error = xrep_newbt_init_metadir_inode(&rr->new_btree, sc);
        if (error)
                return error;

        /* Callbacks that the bulk loader uses to pull records and blocks. */
        rr->new_btree.bload.get_records = xrep_rtrmap_get_records;
        rr->new_btree.bload.claim_block = xrep_rtrmap_claim_block;
        rr->new_btree.bload.iroot_size = xrep_rtrmap_iroot_size;

        /* Stage the new btree on the fake root instead of the live inode. */
        rmap_cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
        xfs_btree_stage_ifakeroot(rmap_cur, &rr->new_btree.ifake);

        /* Compute how many blocks we'll need for the rmaps collected. */
        error = xfs_btree_bload_compute_geometry(rmap_cur,
                        &rr->new_btree.bload, rr->nr_records);
        if (error)
                goto err_cur;

        /* Last chance to abort before we start committing fixes. */
        if (xchk_should_terminate(sc, &error))
                goto err_cur;

        /*
         * Guess how many blocks we're going to need to rebuild an entire
         * rtrmapbt from the number of extents we found, and pump up our
         * transaction to have sufficient block reservation.  We're allowed
         * to exceed quota to repair inconsistent metadata, though this is
         * unlikely.
         */
        error = xfs_trans_reserve_more_inode(sc->tp, rtg_rmap(rtg),
                        rr->new_btree.bload.nr_blocks, 0, true);
        if (error)
                goto err_cur;

        /* Reserve the space we'll need for the new btree. */
        error = xrep_newbt_alloc_blocks(&rr->new_btree,
                        rr->new_btree.bload.nr_blocks);
        if (error)
                goto err_cur;

        /*
         * Create a cursor to the in-memory btree so that we can bulk load the
         * new btree.  xrep_rtrmap_get_records() increments this cursor before
         * reading each record, so it is positioned at the left edge first.
         */
        rr->mcur = xfs_rtrmapbt_mem_cursor(sc->sr.rtg, NULL, &rr->rtrmap_btree);
        error = xfs_btree_goto_left_edge(rr->mcur);
        if (error)
                goto err_mcur;

        /* Add all observed rmap records. */
        rr->new_btree.ifake.if_fork->if_format = XFS_DINODE_FMT_META_BTREE;
        error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
        if (error)
                goto err_mcur;

        /*
         * Install the new rtrmap btree in the inode.  After this point the old
         * btree is no longer accessible, the new tree is live, and we can
         * delete the cursor.
         */
        xfs_rtrmapbt_commit_staged_btree(rmap_cur, sc->tp);
        xrep_inode_set_nblocks(rr->sc, rr->new_btree.ifake.if_blocks);
        xfs_btree_del_cursor(rmap_cur, 0);
        xfs_btree_del_cursor(rr->mcur, 0);
        rr->mcur = NULL;

        /*
         * Now that we've written the new btree to disk, we don't need to keep
         * updating the in-memory btree.  Abort the scan to stop live updates.
         */
        xchk_iscan_abort(&rr->iscan);

        /* Dispose of any unused blocks and the accounting information. */
        error = xrep_newbt_commit(&rr->new_btree);
        if (error)
                return error;

        return xrep_roll_trans(sc);

err_mcur:
        /* Only reached once the in-memory cursor exists. */
        xfs_btree_del_cursor(rr->mcur, error);
err_cur:
        xfs_btree_del_cursor(rmap_cur, error);
        xrep_newbt_cancel(&rr->new_btree);
        return error;
}

/* Live updates, scan setup and teardown, and reaping the old btree. */

/*
 * Decide whether a live rmap update with owner @oi needs to be applied to the
 * shadow btree.  Nothing is wanted once the scan has been aborted.
 */
static inline bool
xrep_rtrmapbt_want_live_update(
        struct xchk_iscan               *iscan,
        const struct xfs_owner_info     *oi)
{
        if (xchk_iscan_aborted(iscan))
                return false;

        /*
         * Non-inode owners are always wanted: the CoW staging extents were
         * scanned before the iscan began, so every later change to them has
         * to be captured.  For inode owners, only files that the scanner has
         * already visited matter.
         */
        return XFS_RMAP_NON_INODE_OWNER(oi->oi_owner) ||
               xchk_iscan_want_live_update(iscan, oi->oi_owner);
}

/*
 * Apply a rtrmapbt update from the regular filesystem into our shadow btree.
 * We're running from the thread that owns the rtrmap ILOCK and is generating
 * the update, so we must be careful about which parts of the struct
 * xrep_rtrmap that we change.
 *
 * This is the notifier callback registered by xrep_rtrmap_setup_scan(); @data
 * is a struct xfs_rmap_update_params describing the update and @action is the
 * rmap intent type.  The return value is always NOTIFY_DONE; a failure to
 * apply the update is recorded by aborting the inode scan instead.
 */
static int
xrep_rtrmapbt_live_update(
        struct notifier_block           *nb,
        unsigned long                   action,
        void                            *data)
{
        struct xfs_rmap_update_params   *p = data;
        struct xrep_rtrmap              *rr;
        struct xfs_mount                *mp;
        struct xfs_btree_cur            *mcur;
        struct xfs_trans                *tp;
        int                             error;

        rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
        mp = rr->sc->mp;

        /* Nothing is locked yet, so this jump goes straight to the return. */
        if (!xrep_rtrmapbt_want_live_update(&rr->iscan, &p->oinfo))
                goto out_unlock;

        trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);

        /* Use our own empty transaction rather than the repair's sc->tp. */
        tp = xfs_trans_alloc_empty(mp);

        /* Same lock that xrep_rtrmap_stash() holds while adding records. */
        mutex_lock(&rr->lock);
        mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
        error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
                        p->blockcount, &p->oinfo, p->unwritten);
        xfs_btree_del_cursor(mcur, error);
        if (error)
                goto out_cancel;

        error = xfbtree_trans_commit(&rr->rtrmap_btree, tp);
        if (error)
                goto out_cancel;

        xfs_trans_cancel(tp);
        mutex_unlock(&rr->lock);
        return NOTIFY_DONE;

out_cancel:
        /*
         * The shadow btree could not be updated.  Cancel the xfbtree update
         * and abort the scan so that the repair notices and gives up.
         */
        xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
        xfs_trans_cancel(tp);
        xchk_iscan_abort(&rr->iscan);
        mutex_unlock(&rr->lock);
out_unlock:
        return NOTIFY_DONE;
}

/*
 * Set up the filesystem scan components: the lock, the bitmap of old btree
 * blocks, the in-memory rtrmap btree, the inode scan, and the live update
 * hook.  On failure, everything that was set up here is torn down again
 * before the error is returned.
 */
STATIC int
xrep_rtrmap_setup_scan(
        struct xrep_rtrmap      *rr)
{
        struct xfs_scrub        *sc = rr->sc;
        int                     error;

        mutex_init(&rr->lock);
        xfsb_bitmap_init(&rr->old_rtrmapbt_blocks);

        /* Set up some storage */
        error = xfs_rtrmapbt_mem_init(sc->mp, &rr->rtrmap_btree, sc->xmbtp,
                        rtg_rgno(sc->sr.rtg));
        if (error)
                goto out_bitmap;

        /* Retry iget every tenth of a second for up to 30 seconds. */
        xchk_iscan_start(sc, 30000, 100, &rr->iscan);

        /*
         * Hook into live rtrmap operations so that we can update our in-memory
         * btree to reflect live changes on the filesystem.  Since we drop the
         * rtrmap ILOCK to scan all the inodes, we need this piece to avoid
         * installing a stale btree.
         */
        ASSERT(sc->flags & XCHK_FSGATES_RMAP);
        xfs_rmap_hook_setup(&rr->rhook, xrep_rtrmapbt_live_update);
        error = xfs_rmap_hook_add(rtg_group(sc->sr.rtg), &rr->rhook);
        if (error)
                goto out_iscan;
        return 0;

        /* Unwind in the reverse order of setup. */
out_iscan:
        xchk_iscan_teardown(&rr->iscan);
        xfbtree_destroy(&rr->rtrmap_btree);
out_bitmap:
        xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
        mutex_destroy(&rr->lock);
        return error;
}

/*
 * Tear down scan components.  This undoes everything that
 * xrep_rtrmap_setup_scan() set up.
 */
STATIC void
xrep_rtrmap_teardown(
        struct xrep_rtrmap      *rr)
{
        struct xfs_scrub        *sc = rr->sc;

        /*
         * Abort the scan first: xrep_rtrmapbt_want_live_update() rejects
         * updates once the scan is aborted.  Then remove the hook before
         * destroying the structures that the hook function uses.
         */
        xchk_iscan_abort(&rr->iscan);
        xfs_rmap_hook_del(rtg_group(sc->sr.rtg), &rr->rhook);
        xchk_iscan_teardown(&rr->iscan);
        xfbtree_destroy(&rr->rtrmap_btree);
        xfsb_bitmap_destroy(&rr->old_rtrmapbt_blocks);
        mutex_destroy(&rr->lock);
}

/*
 * Repair the realtime rmap btree.
 *
 * Fix up the metadata inode forks, set up the scan machinery, collect rmaps
 * for the rtgroup, build and install a new btree from them, and reap the
 * blocks of the old btree.  The scan machinery is torn down on every path
 * once it has been set up.  Returns 0 or a negative errno.
 */
int
xrep_rtrmapbt(
        struct xfs_scrub        *sc)
{
        struct xrep_rtrmap      *rr = sc->buf;
        int                     error;

        /* Make sure any problems with the fork are fixed. */
        error = xrep_metadata_inode_forks(sc);
        if (error)
                return error;

        error = xrep_rtrmap_setup_scan(rr);
        if (error)
                return error;

        /* Collect rmaps for realtime files. */
        error = xrep_rtrmap_find_rmaps(rr);
        if (error)
                goto out_teardown;

        xfs_trans_ijoin(sc->tp, sc->ip, 0);

        /* Rebuild the rtrmap information. */
        error = xrep_rtrmap_build_new_tree(rr);
        if (error)
                goto out_teardown;

        /*
         * Free all the extents that were allocated to the former rtrmapbt and
         * aren't cross-linked with something else.
         */
        error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);

out_teardown:
        xrep_rtrmap_teardown(rr);
        return error;
}