root/fs/xfs/xfs_verify_media.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2026 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_trace.h"
#include "xfs_verify_media.h"

#include <linux/fserror.h>

/*
 * Range of lost blocks within a single group.  A pointer to this is handed
 * to the rmap query callback as its private data so that each reverse mapping
 * can be trimmed to the part that overlaps the lost range.
 */
struct xfs_group_data_lost {
        /* first lost block, relative to the start of the group */
        xfs_agblock_t           startblock;
        /* number of lost blocks, starting at startblock */
        xfs_extlen_t            blockcount;
};

/* Report lost file data from rmap records */
static int
xfs_verify_report_data_lost(
        struct xfs_btree_cur            *cur,
        const struct xfs_rmap_irec      *rec,
        void                            *data)
{
        struct xfs_mount                *mp = cur->bc_mp;
        struct xfs_inode                *ip;
        struct xfs_group_data_lost      *lost = data;
        xfs_fileoff_t                   fileoff = rec->rm_offset;
        xfs_extlen_t                    blocks = rec->rm_blockcount;
        const bool                      is_attr =
                        (rec->rm_flags & XFS_RMAP_ATTR_FORK);
        const xfs_agblock_t             lost_end =
                        lost->startblock + lost->blockcount;
        const xfs_agblock_t             rmap_end =
                        rec->rm_startblock + rec->rm_blockcount;
        int                             error = 0;

        if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
               return 0;

        error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
        if (error)
                return 0;

        if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
                xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
                goto out_rele;
        }

        if (is_attr) {
                xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
                goto out_rele;
        }

        if (lost->startblock > rec->rm_startblock) {
                fileoff += lost->startblock - rec->rm_startblock;
                blocks -= lost->startblock - rec->rm_startblock;
        }
        if (rmap_end > lost_end)
                blocks -= rmap_end - lost_end;

        fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
                        XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
        xfs_irele(ip);
        return 0;
}

/*
 * Walk reverse mappings to look for all file data loss.
 *
 * The lost media is the disk address range [@daddr, @daddr + @bblen) on the
 * device backing groups of the given @type.  The rmap btree of every group
 * that the range touches is queried, and each overlapping record is handed to
 * xfs_verify_report_data_lost().
 *
 * This is a best-effort walk: it stops at the first group that cannot be
 * queried, and it always returns 0.
 */
static int
xfs_verify_report_losses(
        struct xfs_mount        *mp,
        enum xfs_group_type     type,
        xfs_daddr_t             daddr,
        u64                     bblen)
{
        struct xfs_group        *xg = NULL;
        struct xfs_trans        *tp;
        xfs_fsblock_t           start_bno, end_bno;
        uint32_t                start_gno, end_gno;
        int                     error;

        /* Convert the disk range to an inclusive range of fs blocks. */
        if (type == XG_TYPE_RTG) {
                start_bno = xfs_daddr_to_rtb(mp, daddr);
                end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
        } else {
                start_bno = XFS_DADDR_TO_FSB(mp, daddr);
                end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
        }

        tp = xfs_trans_alloc_empty(mp);
        start_gno = xfs_fsb_to_gno(mp, start_bno, type);
        end_gno = xfs_fsb_to_gno(mp, end_bno, type);
        while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
                struct xfs_buf          *agf_bp = NULL;
                struct xfs_rtgroup      *rtg = NULL;
                struct xfs_btree_cur    *cur;
                struct xfs_rmap_irec    ri_low = { };
                struct xfs_rmap_irec    ri_high;
                struct xfs_group_data_lost lost;
                /* last lost block in this group, inclusive */
                xfs_agblock_t           last_gbno = xg->xg_block_count - 1;

                if (type == XG_TYPE_AG) {
                        struct xfs_perag        *pag = to_perag(xg);

                        error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
                        if (error) {
                                xfs_perag_rele(pag);
                                break;
                        }

                        cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
                } else {
                        rtg = to_rtg(xg);
                        xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
                        cur = xfs_rtrmapbt_init_cursor(tp, rtg);
                }

                /*
                 * Set the rmap range from ri_low to ri_high, which represents
                 * a [start, end] where we are looking for the files or
                 * metadata.  The low key stays zeroed and the high key stays
                 * all ones unless the lost range starts or ends in this group.
                 */
                memset(&ri_high, 0xFF, sizeof(ri_high));
                if (xg->xg_gno == start_gno)
                        ri_low.rm_startblock =
                                xfs_fsb_to_gbno(mp, start_bno, type);
                if (xg->xg_gno == end_gno) {
                        ri_high.rm_startblock =
                                xfs_fsb_to_gbno(mp, end_bno, type);
                        last_gbno = min(last_gbno, ri_high.rm_startblock);
                }

                /*
                 * Work out the length from the last lost block rather than
                 * from ri_high.rm_startblock + 1: when the lost range runs on
                 * into the next group the high key is still all ones, and
                 * adding one to it would wrap around to zero.
                 */
                lost.startblock = ri_low.rm_startblock;
                lost.blockcount = last_gbno - ri_low.rm_startblock + 1;

                error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
                                xfs_verify_report_data_lost, &lost);
                xfs_btree_del_cursor(cur, error);
                if (agf_bp)
                        xfs_trans_brelse(tp, agf_bp);
                if (rtg)
                        xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
                if (error) {
                        /* Leaving the loop early, so drop the group here. */
                        xfs_group_rele(xg);
                        break;
                }
        }

        xfs_trans_cancel(tp);
        return 0;
}

/*
 * Pick the size of each verification read, in bytes.
 *
 * Large reads keep the per-command overhead down, so the default is 1MB.  The
 * caller may lower that with me->me_max_io_size; zero means no preference.
 * The result is then clamped between one logical sector and the number of
 * bytes left to verify.
 */
static unsigned int
xfs_verify_iosize(
        const struct xfs_verify_media   *me,
        struct xfs_buftarg              *btp,
        uint64_t                        bbcount)
{
        unsigned int                    iosize = SZ_1M;

        BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
        ASSERT(BBTOB(bbcount) >= btp->bt_logical_sectorsize);

        /* Honor a smaller, nonzero request from the caller. */
        if (me->me_max_io_size != 0 && me->me_max_io_size < iosize)
                iosize = me->me_max_io_size;

        return clamp(iosize, btp->bt_logical_sectorsize, BBTOB(bbcount));
}

/*
 * Allocate the verification buffer, settling for less memory if need be.
 *
 * Start at the order that get_order() reports for @iosize and step down one
 * order at a time without retrying.  Only the final order-0 attempt is made
 * without __GFP_NORETRY.  Returns NULL if even that fails.
 */
static struct folio *
xfs_verify_alloc_folio(
        const unsigned int      iosize)
{
        struct folio            *folio;
        unsigned int            order;

        for (order = get_order(iosize); order > 0; order--) {
                folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
                if (folio)
                        return folio;
        }

        return folio_alloc(GFP_KERNEL, 0);
}

/*
 * Handle a failed verification read of @bio_bbcount sectors at @daddr.
 *
 * The error is recorded in me->me_ioerror only if nothing was verified before
 * it.  Errors that look like data loss are passed on to healthmon and, when
 * reverse mappings are available, to the owners of the affected blocks, but
 * only if the caller asked for reporting.
 */
static void
xfs_verify_media_error(
        struct xfs_mount        *mp,
        struct xfs_verify_media *me,
        struct xfs_buftarg      *btp,
        xfs_daddr_t             daddr,
        unsigned int            bio_bbcount,
        blk_status_t            bio_status)
{
        trace_xfs_verify_media_error(mp, me, btp->bt_dev, daddr, bio_bbcount,
                        bio_status);

        /*
         * If the very first read failed, no bytes were verified at all, so
         * hand the error (I/O or otherwise) back to the caller.
         */
        if (daddr == me->me_start_daddr)
                me->me_ioerror = -blk_status_to_errno(bio_status);

        /*
         * Only PI validation failures, medium errors, and general IO errors
         * count as data loss.  Anything else is (hopefully) transient and is
         * not reported to healthmon or fsnotify.
         */
        if (bio_status != BLK_STS_PROTECTION &&
            bio_status != BLK_STS_IOERR &&
            bio_status != BLK_STS_MEDIUM)
                return;

        if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
                return;

        xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);

        /* Finding the owners of the lost blocks requires reverse mappings. */
        if (!xfs_has_rmapbt(mp))
                return;

        if (me->me_dev == XFS_DEV_DATA)
                xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
        else if (me->me_dev == XFS_DEV_RT)
                xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
}

/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * On a zero return, me->me_start_daddr has been advanced to the end of what
 * was verified, which falls short of me->me_end_daddr if a read failed.
 * me->me_ioerror is cleared on entry and is only set if the very first read
 * fails.
 *
 * Returns 0 on success (including a partially verified range), -ENODEV if the
 * requested device is absent or the log shares the data device, -EINVAL if
 * the range is not aligned to the logical sector size, -ENOMEM if the buffer
 * or bio cannot be allocated, or -EINTR if a fatal signal is pending between
 * reads.
 */
static int
xfs_verify_media(
        struct xfs_mount        *mp,
        struct xfs_verify_media *me)
{
        struct xfs_buftarg      *btp = NULL;
        struct bio              *bio;
        struct folio            *folio;
        xfs_daddr_t             daddr;
        uint64_t                bbcount;
        int                     error = 0;

        me->me_ioerror = 0;

        switch (me->me_dev) {
        case XFS_DEV_DATA:
                btp = mp->m_ddev_targp;
                break;
        case XFS_DEV_LOG:
                /* A log that lives on the data device has no target here. */
                if (mp->m_logdev_targp != mp->m_ddev_targp)
                        btp = mp->m_logdev_targp;
                break;
        case XFS_DEV_RT:
                btp = mp->m_rtdev_targp;
                break;
        }
        if (!btp)
                return -ENODEV;

        /*
         * If the caller told us to verify beyond the end of the disk, tell the
         * user exactly where that was.
         */
        if (me->me_end_daddr > btp->bt_nr_sectors)
                me->me_end_daddr = btp->bt_nr_sectors;

        /* start and end have to be aligned to the lba size */
        if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
                        btp->bt_logical_sectorsize))
                return -EINVAL;

        /*
         * end_daddr is the exclusive end of the range, so if start_daddr
         * reaches there (or beyond), there's no work to be done.
         */
        if (me->me_start_daddr >= me->me_end_daddr)
                return 0;

        /*
         * There are three ranges involved here:
         *
         *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
         *    user wants to verify.  end_daddr can be beyond the end of the
         *    disk; we'll constrain it to the end if necessary.
         *
         *  - [daddr, me->me_end_daddr) is the range that we have not yet
         *    verified.  We update daddr after each successful read.
         *    me->me_start_daddr is set to daddr before returning.
         *
         *  - [daddr, daddr + bio_bbcount) is the range that we're currently
         *    verifying.
         */
        daddr = me->me_start_daddr;
        bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
                          me->me_start_daddr;

        /*
         * NOTE(review): each read below is sized by the folio, not by the
         * value that xfs_verify_iosize() computed.  If get_order() rounds the
         * requested size up, reads could exceed me->me_max_io_size -- confirm
         * whether that field is meant to be a hard cap.
         */
        folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
        if (!folio)
                return -ENOMEM;

        trace_xfs_verify_media(mp, me, btp->bt_dev, daddr, bbcount, folio);

        bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
        if (!bio) {
                error = -ENOMEM;
                goto out_folio;
        }

        while (bbcount > 0) {
                unsigned int    bio_bbcount;
                blk_status_t    bio_status;

                /* Reuse the same bio and folio for every read. */
                bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
                bio->bi_iter.bi_sector = daddr;
                bio_add_folio_nofail(bio, folio,
                                min(bbcount << SECTOR_SHIFT, folio_size(folio)),
                                0);

                /*
                 * Save the length of the bio before we submit it, because we
                 * need the original daddr and length for reporting IO errors
                 * if the bio fails.
                 */
                bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
                submit_bio_wait(bio);
                bio_status = bio->bi_status;
                if (bio_status != BLK_STS_OK) {
                        /*
                         * A failed read is a verification result, not an
                         * operational error, so stop here and still report
                         * how far we got.
                         */
                        xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
                                        bio_status);
                        error = 0;
                        break;
                }

                daddr += bio_bbcount;
                bbcount -= bio_bbcount;

                if (bbcount == 0)
                        break;

                if (me->me_rest_us) {
                        ktime_t expires;

                        /*
                         * Widen before scaling to nanoseconds so that a long
                         * rest period cannot overflow a narrower field type
                         * and turn into a much shorter sleep.
                         */
                        expires = ktime_add_ns(ktime_get(),
                                        (u64)me->me_rest_us * 1000);
                        set_current_state(TASK_KILLABLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }

                if (fatal_signal_pending(current)) {
                        error = -EINTR;
                        break;
                }

                cond_resched();
        }

        bio_put(bio);
out_folio:
        folio_put(folio);

        if (error)
                return error;

        /*
         * Advance start_daddr to the end of what we verified if there wasn't
         * an operational error.
         */
        me->me_start_daddr = daddr;
        trace_xfs_verify_media_end(mp, me, btp->bt_dev);
        return 0;
}

/*
 * ioctl entry point for media verification.
 *
 * Copies the request in from @arg, validates it, runs the verification, and
 * copies the updated request back out.  Requires CAP_SYS_ADMIN.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if the request
 * cannot be copied in or out, -EINVAL for nonzero padding, unknown flags or an
 * unknown device, or whatever error xfs_verify_media() returns.  Nothing is
 * copied back to userspace when xfs_verify_media() fails.
 */
int
xfs_ioc_verify_media(
        struct file                     *file,
        struct xfs_verify_media __user  *arg)
{
        struct xfs_mount                *mp = XFS_I(file_inode(file))->i_mount;
        struct xfs_verify_media         me;
        int                             error;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (copy_from_user(&me, arg, sizeof(me)))
                return -EFAULT;

        /* Reject nonzero padding and flags we do not know about. */
        if (me.me_pad || (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS))
                return -EINVAL;

        /* Only the data, log, and realtime devices can be verified. */
        if (me.me_dev != XFS_DEV_DATA &&
            me.me_dev != XFS_DEV_LOG &&
            me.me_dev != XFS_DEV_RT)
                return -EINVAL;

        error = xfs_verify_media(mp, &me);
        if (error)
                return error;

        return copy_to_user(arg, &me, sizeof(me)) ? -EFAULT : 0;
}