root/fs/xfs/xfs_zone_gc.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that, a
 * number of zones (XFS_GC_ZONES) are reserved from the user-visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch allocation, and the number of buffers.
 */
#define XFS_GC_BUF_SIZE         SZ_1M
#define XFS_GC_NR_BUFS          2
static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
        struct xfs_zone_gc_data         *data;

        /*
         * Entry into the reading/writing/resetting list.  Only accessed from
         * the GC thread, so no locking needed.
         */
        struct list_head                entry;

        /*
         * State of this gc_bio.  Done means the current I/O completed.
         * Set from the bio end I/O handler, read from the GC thread.
         */
        enum {
                XFS_GC_BIO_NEW,
                XFS_GC_BIO_DONE,
        } state;

        /*
         * Pointer to the inode and byte range in the inode that this
         * GC chunk is operating on.
         */
        struct xfs_inode                *ip;
        loff_t                          offset;
        unsigned int                    len;

        /*
         * Existing startblock (in the zone to be freed) and newly assigned
         * daddr in the zone GCed into.
         */
        xfs_fsblock_t                   old_startblock;
        xfs_daddr_t                     new_daddr;

        /* Are we writing to a sequential write required zone? */
        bool                            is_seq;

        /* Open Zone being written to */
        struct xfs_open_zone            *oz;

        struct xfs_rtgroup              *victim_rtg;

        /* Bio used for reads and writes, including the bvec used by it */
        struct bio                      bio;    /* must be last */
};

#define XFS_ZONE_GC_RECS                1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
        struct xfs_rtgroup              *victim_rtg;
        unsigned int                    rec_count;
        unsigned int                    rec_idx;
        xfs_agblock_t                   next_startblock;
        struct xfs_rmap_irec            *recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
        struct xfs_mount                *mp;

        /* bioset used to allocate the gc_bios */
        struct bio_set                  bio_set;

        /*
         * Scratchpad to buffer GC data, organized as a ring buffer over
         * discontiguous folios.  scratch_head is where the buffer is filled,
         * scratch_tail tracks the buffer space freed, and scratch_available
         * counts the space available in the ring buffer between the head and
         * the tail.
         */
        struct folio                    *scratch_folios[XFS_GC_NR_BUFS];
        unsigned int                    scratch_size;
        unsigned int                    scratch_available;
        unsigned int                    scratch_head;
        unsigned int                    scratch_tail;

        /*
         * List of bios currently being read, written and reset.
         * These lists are only accessed by the GC thread itself, and must only
         * be processed in order.
         */
        struct list_head                reading;
        struct list_head                writing;
        struct list_head                resetting;

        /*
         * Iterator for the victim zone.
         */
        struct xfs_zone_gc_iter         iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
        struct xfs_mount        *mp)
{
        s64                     available, free, threshold;
        s32                     remainder;

        if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
                return false;

        available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

        if (available <
            xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
                return true;

        free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

        threshold = div_s64_rem(free, 100, &remainder);
        threshold = threshold * mp->m_zonegc_low_space +
                    remainder * div_s64(mp->m_zonegc_low_space, 100);

        if (available < threshold)
                return true;

        return false;
}

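/*
 * Allocate the per-mount GC state, including the scratch folios used to
 * buffer data read from victim zones and the bioset that backs the GC
 * chunk bios.
 */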
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
        struct xfs_mount        *mp)
{
        struct xfs_zone_gc_data *data;
        int                     i;

        data = kzalloc_obj(*data);
        if (!data)
                return NULL;
        data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
        if (!data->iter.recs)
                goto out_free_data;

        if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
                        BIOSET_NEED_BVECS))
                goto out_free_recs;
        for (i = 0; i < XFS_GC_NR_BUFS; i++) {
                data->scratch_folios[i] =
                        folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
                if (!data->scratch_folios[i])
                        goto out_free_scratch;
        }
        data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
        data->scratch_available = data->scratch_size;
        INIT_LIST_HEAD(&data->reading);
        INIT_LIST_HEAD(&data->writing);
        INIT_LIST_HEAD(&data->resetting);
        data->mp = mp;
        return data;

out_free_scratch:
        while (--i >= 0)
                folio_put(data->scratch_folios[i]);
        bioset_exit(&data->bio_set);
out_free_recs:
        kfree(data->iter.recs);
out_free_data:
        kfree(data);
        return NULL;
}

static void
xfs_zone_gc_data_free(
        struct xfs_zone_gc_data *data)
{
        int                     i;

        for (i = 0; i < XFS_GC_NR_BUFS; i++)
                folio_put(data->scratch_folios[i]);
        bioset_exit(&data->bio_set);
        kfree(data->iter.recs);
        kfree(data);
}

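/*
 * Set up the iterator for a new victim zone.  Bump the zone's gccount so
 * that it isn't picked as a victim again while GC I/O for it is still in
 * flight.
 */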
static void
xfs_zone_gc_iter_init(
        struct xfs_zone_gc_iter *iter,
        struct xfs_rtgroup      *victim_rtg)

{
        iter->next_startblock = 0;
        iter->rec_count = 0;
        iter->rec_idx = 0;
        iter->victim_rtg = victim_rtg;
        atomic_inc(&victim_rtg->rtg_gccount);
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
        struct xfs_btree_cur    *cur,
        const struct xfs_rmap_irec *irec,
        void                    *private)
{
        struct xfs_zone_gc_iter *iter = private;

        ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
        ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
        ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

        iter->recs[iter->rec_count] = *irec;
        if (++iter->rec_count == XFS_ZONE_GC_RECS) {
                iter->next_startblock =
                        irec->rm_startblock + irec->rm_blockcount;
                return 1;
        }
        return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
        const void                      *a,
        const void                      *b)
{
        const struct xfs_rmap_irec      *reca = a;
        const struct xfs_rmap_irec      *recb = b;
        int                             diff;

        diff = cmp_int(reca->rm_owner, recb->rm_owner);
        if (diff)
                return diff;
        return cmp_int(reca->rm_offset, recb->rm_offset);
}

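/*
 * Gather the next batch of rmap records from the victim zone and sort them
 * by inode and offset.  Once the whole zone has been iterated, drop the
 * references to the victim and clear it from the iterator.
 */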
static int
xfs_zone_gc_query(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter)
{
        struct xfs_rtgroup      *rtg = iter->victim_rtg;
        struct xfs_rmap_irec    ri_low = { };
        struct xfs_rmap_irec    ri_high;
        struct xfs_btree_cur    *cur;
        struct xfs_trans        *tp;
        int                     error;

        ASSERT(iter->next_startblock <= rtg_blocks(rtg));
        if (iter->next_startblock == rtg_blocks(rtg))
                goto done;

        ASSERT(iter->next_startblock < rtg_blocks(rtg));
        ri_low.rm_startblock = iter->next_startblock;
        memset(&ri_high, 0xFF, sizeof(ri_high));

        iter->rec_idx = 0;
        iter->rec_count = 0;

        tp = xfs_trans_alloc_empty(mp);
        xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
        cur = xfs_rtrmapbt_init_cursor(tp, rtg);
        error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
                        xfs_zone_gc_query_cb, iter);
        xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
        xfs_btree_del_cursor(cur, error < 0 ? error : 0);
        xfs_trans_cancel(tp);

        if (error < 0)
                return error;

        /*
         * Sort the rmap records by inode number and increasing offset to
         * defragment the mappings.
         *
         * This could be further enhanced by an even bigger lookahead window,
         * but that's better left until we have better detection of changes to
         * the inode mapping to avoid the potential of GCing already dead data.
         */
        sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
                        xfs_zone_gc_rmap_rec_cmp, NULL);

        if (error == 0) {
                /*
                 * We finished iterating through the zone.
                 */
                iter->next_startblock = rtg_blocks(rtg);
                if (iter->rec_count == 0)
                        goto done;
        }

        return 0;
done:
        atomic_dec(&iter->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(iter->victim_rtg);
        iter->victim_rtg = NULL;
        return 0;
}

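/*
 * Return the next rmap record to evacuate and grab a reference to the inode
 * that owns it.  Records owned by inodes that have since been deleted, or
 * that are not regular realtime files, are skipped.
 */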
static bool
xfs_zone_gc_iter_next(
        struct xfs_mount        *mp,
        struct xfs_zone_gc_iter *iter,
        struct xfs_rmap_irec    *chunk_rec,
        struct xfs_inode        **ipp)
{
        struct xfs_rmap_irec    *irec;
        int                     error;

        if (!iter->victim_rtg)
                return false;

retry:
        if (iter->rec_idx == iter->rec_count) {
                error = xfs_zone_gc_query(mp, iter);
                if (error)
                        goto fail;
                if (!iter->victim_rtg)
                        return false;
        }

        irec = &iter->recs[iter->rec_idx];
        error = xfs_iget(mp, NULL, irec->rm_owner,
                        XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
        if (error) {
                /*
                 * If the inode was already deleted, skip over it.
                 */
                if (error == -ENOENT) {
                        iter->rec_idx++;
                        goto retry;
                }
                goto fail;
        }

        if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
                iter->rec_idx++;
                xfs_irele(*ipp);
                goto retry;
        }

        *chunk_rec = *irec;
        return true;

fail:
        xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        return false;
}

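/*
 * Account for count_fsb blocks of the current record having been processed
 * and move on to the next record once it is fully consumed.
 */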
static void
xfs_zone_gc_iter_advance(
        struct xfs_zone_gc_iter *iter,
        xfs_extlen_t            count_fsb)
{
        struct xfs_rmap_irec    *irec = &iter->recs[iter->rec_idx];

        irec->rm_offset += count_fsb;
        irec->rm_startblock += count_fsb;
        irec->rm_blockcount -= count_fsb;
        if (!irec->rm_blockcount)
                iter->rec_idx++;
}

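/*
 * Scan one used-space bucket for the zone with the fewest used blocks that
 * isn't already undergoing GC or just waiting for a reset.
 */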
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
        struct xfs_mount        *mp,
        uint32_t                bucket)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        uint32_t                victim_used = U32_MAX;
        struct xfs_rtgroup      *victim_rtg = NULL;
        uint32_t                bit;

        if (!zi->zi_used_bucket_entries[bucket])
                return NULL;

        for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
                        mp->m_sb.sb_rgcount) {
                struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

                if (!rtg)
                        continue;

                /*
                 * If the zone is already undergoing GC, don't pick it again.
                 *
                 * This prevents us from picking one of the zones for which we
                 * already submitted GC I/O, but for which the remapping hasn't
                 * concluded yet.  This won't cause data corruption, but
                 * increases write amplification and slows down GC, so this is
                 * a bad thing.
                 */
                if (atomic_read(&rtg->rtg_gccount)) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                /* skip zones that are just waiting for a reset */
                if (rtg_rmap(rtg)->i_used_blocks == 0 ||
                    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
                        xfs_rtgroup_rele(rtg);
                        continue;
                }

                if (victim_rtg)
                        xfs_rtgroup_rele(victim_rtg);
                victim_rtg = rtg;
                victim_used = rtg_rmap(rtg)->i_used_blocks;

                /*
                 * Any zone that is less than 1 percent used is fair game for
                 * instant reclaim.  All of these zones are in bucket 0, so
                 * avoid the expensive division for the zones in the other
                 * buckets.
                 */
                if (bucket == 0 &&
                    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
                        break;
        }

        return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_rtgroup      *victim_rtg = NULL;
        unsigned int            bucket;

        spin_lock(&zi->zi_used_buckets_lock);
        for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
                victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
                if (victim_rtg)
                        break;
        }
        spin_unlock(&zi->zi_used_buckets_lock);

        if (!victim_rtg)
                return false;

        trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
        xfs_zone_gc_iter_init(iter, victim_rtg);
        return true;
}

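/*
 * Remove the open zone with the least allocated space from the list of open
 * zones used for regular writes and dedicate it to GC.
 */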
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
        struct xfs_zone_info    *zi)
{
        struct xfs_open_zone    *oz, *found = NULL;

        spin_lock(&zi->zi_open_zones_lock);
        list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
                if (!found || oz->oz_allocated < found->oz_allocated)
                        found = oz;
        }

        if (found) {
                found->oz_is_gc = true;
                list_del_init(&found->oz_entry);
                zi->zi_nr_open_zones--;
        }

        spin_unlock(&zi->zi_open_zones_lock);
        return found;
}

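/*
 * Pick a new GC target zone.  Don't open a new one until all writes to the
 * previous target have completed.
 */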
static struct xfs_open_zone *
xfs_zone_gc_select_target(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_open_zone    *oz = zi->zi_open_gc_zone;

        /*
         * We need to wait for pending writes to finish.
         */
        if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
                return NULL;

        ASSERT(zi->zi_nr_open_zones <=
                mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
        oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (oz)
                trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        spin_lock(&zi->zi_open_zones_lock);
        zi->zi_open_gc_zone = oz;
        spin_unlock(&zi->zi_open_zones_lock);
        return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it; otherwise first
 * wait for all pending writes to finish and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
        struct xfs_mount        *mp)
{
        struct xfs_open_zone    *oz = mp->m_zone_info->zi_open_gc_zone;

        if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
                return xfs_zone_gc_select_target(mp);
        return oz;
}

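/*
 * End I/O handler for GC reads, writes and zone resets: mark the chunk as
 * done and wake the GC thread to process the completion.
 */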
static void
xfs_zone_gc_end_io(
        struct bio              *bio)
{
        struct xfs_gc_bio       *chunk =
                container_of(bio, struct xfs_gc_bio, bio);
        struct xfs_zone_gc_data *data = chunk->data;

        WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
        wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

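/*
 * Allocate space in the GC target zone for the data to be relocated.  The
 * blocks come straight from the reserved pools, and the count may be trimmed
 * to the available scratch space, the remaining space in the target zone and
 * the reserves.  For conventional zones the final disk address is known up
 * front, while sequential write required zones use zone append and report
 * the actual location at write completion.
 */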
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
        struct xfs_zone_gc_data *data,
        xfs_extlen_t            *count_fsb,
        xfs_daddr_t             *daddr,
        bool                    *is_seq)
{
        struct xfs_mount        *mp = data->mp;
        struct xfs_open_zone    *oz;

        oz = xfs_zone_gc_ensure_target(mp);
        if (!oz)
                return NULL;

        *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));

        /*
         * Directly allocate GC blocks from the reserved pool.
         *
         * If we'd take them from the normal pool we could be stealing blocks
         * from a regular writer, which would then have to wait for GC and
         * deadlock.
         */
        spin_lock(&mp->m_sb_lock);
        *count_fsb = min(*count_fsb,
                        rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
        *count_fsb = min3(*count_fsb,
                        mp->m_free[XC_FREE_RTEXTENTS].res_avail,
                        mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
        mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
        mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
        spin_unlock(&mp->m_sb_lock);

        if (!*count_fsb)
                return NULL;

        *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
        *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
        if (!*is_seq)
                *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
        oz->oz_allocated += *count_fsb;
        atomic_inc(&oz->oz_ref);
        return oz;
}

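/*
 * Add the scratch buffer space backing this chunk to its bio, wrapping
 * around the ring of scratch folios as needed.
 */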
static void
xfs_zone_gc_add_data(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_zone_gc_data *data = chunk->data;
        unsigned int            len = chunk->len;
        unsigned int            off = data->scratch_head;

        do {
                unsigned int    this_off = off % XFS_GC_BUF_SIZE;
                unsigned int    this_len = min(len, XFS_GC_BUF_SIZE - this_off);

                bio_add_folio_nofail(&chunk->bio,
                                data->scratch_folios[off / XFS_GC_BUF_SIZE],
                                this_len, this_off);
                len -= this_len;
                off += this_len;
                if (off == data->scratch_size)
                        off = 0;
        } while (len);
}

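/*
 * Kick off GC for the next chunk: find the next mapping to evacuate,
 * allocate space for it in the GC target zone and submit a read of the old
 * data into the scratch buffer.
 */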
static bool
xfs_zone_gc_start_chunk(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_gc_iter *iter = &data->iter;
        struct xfs_mount        *mp = data->mp;
        struct block_device     *bdev = mp->m_rtdev_targp->bt_bdev;
        struct xfs_open_zone    *oz;
        struct xfs_rmap_irec    irec;
        struct xfs_gc_bio       *chunk;
        struct xfs_inode        *ip;
        struct bio              *bio;
        xfs_daddr_t             daddr;
        unsigned int            len;
        bool                    is_seq;

        if (xfs_is_shutdown(mp))
                return false;

        if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
                return false;
        oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
                        &is_seq);
        if (!oz) {
                xfs_irele(ip);
                return false;
        }

        len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
        bio = bio_alloc_bioset(bdev,
                        min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
                        REQ_OP_READ, GFP_NOFS, &data->bio_set);

        chunk = container_of(bio, struct xfs_gc_bio, bio);
        chunk->ip = ip;
        chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
        chunk->len = len;
        chunk->old_startblock =
                xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
        chunk->new_daddr = daddr;
        chunk->is_seq = is_seq;
        chunk->data = data;
        chunk->oz = oz;
        chunk->victim_rtg = iter->victim_rtg;
        atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
        bio->bi_end_io = xfs_zone_gc_end_io;
        xfs_zone_gc_add_data(chunk);
        data->scratch_head = (data->scratch_head + len) % data->scratch_size;
        data->scratch_available -= len;

        XFS_STATS_INC(mp, xs_gc_read_calls);

        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&chunk->entry, &data->reading);
        xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

        submit_bio(bio);
        return true;
}

static void
xfs_zone_gc_free_chunk(
        struct xfs_gc_bio       *chunk)
{
        atomic_dec(&chunk->victim_rtg->rtg_gccount);
        xfs_rtgroup_rele(chunk->victim_rtg);
        list_del(&chunk->entry);
        xfs_open_zone_put(chunk->oz);
        xfs_irele(chunk->ip);
        bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        if (chunk->is_seq) {
                chunk->bio.bi_opf &= ~REQ_OP_WRITE;
                chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
        }
        chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
        chunk->bio.bi_end_io = xfs_zone_gc_end_io;
        submit_bio(&chunk->bio);
}

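/*
 * Zone append bios cannot exceed the hardware limit of the device, so split
 * off a chunk that fits while keeping it file system block size aligned.
 * Returns NULL when no (further) split is needed.
 */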
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
        struct xfs_zone_gc_data *data,
        struct xfs_gc_bio       *chunk)
{
        struct queue_limits     *lim =
                &bdev_get_queue(chunk->bio.bi_bdev)->limits;
        struct xfs_gc_bio       *split_chunk;
        int                     split_sectors;
        unsigned int            split_len;
        struct bio              *split;
        unsigned int            nsegs;

        if (!chunk->is_seq)
                return NULL;

        split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
                        lim->max_zone_append_sectors << SECTOR_SHIFT);
        if (!split_sectors)
                return NULL;

        /* ensure the split chunk is still block size aligned */
        split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
                        data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
        split_len = split_sectors << SECTOR_SHIFT;

        split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
        split_chunk = container_of(split, struct xfs_gc_bio, bio);
        split_chunk->data = data;
        ihold(VFS_I(chunk->ip));
        split_chunk->ip = chunk->ip;
        split_chunk->is_seq = chunk->is_seq;
        split_chunk->offset = chunk->offset;
        split_chunk->len = split_len;
        split_chunk->old_startblock = chunk->old_startblock;
        split_chunk->new_daddr = chunk->new_daddr;
        split_chunk->oz = chunk->oz;
        atomic_inc(&chunk->oz->oz_ref);

        split_chunk->victim_rtg = chunk->victim_rtg;
        atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
        atomic_inc(&chunk->victim_rtg->rtg_gccount);

        chunk->offset += split_len;
        chunk->len -= split_len;
        chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

        /* add right before the original chunk */
        WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
        list_add_tail(&split_chunk->entry, &chunk->entry);
        return split_chunk;
}

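/*
 * A GC read has completed.  Reuse the bio to write the data to its new
 * location, splitting it as needed for zone append.
 */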
static void
xfs_zone_gc_write_chunk(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_zone_gc_data *data = chunk->data;
        struct xfs_mount        *mp = chunk->ip->i_mount;
        struct xfs_gc_bio       *split_chunk;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        XFS_STATS_INC(mp, xs_gc_write_calls);
        XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);

        WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
        list_move_tail(&chunk->entry, &data->writing);

        bio_reuse(&chunk->bio, REQ_OP_WRITE);
        while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
                xfs_zone_gc_submit_write(data, split_chunk);
        xfs_zone_gc_submit_write(data, chunk);
}

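/*
 * A GC write has completed.  Return the scratch space to the ring buffer
 * and remap the file range to the new blocks, unless another write raced
 * with us and already changed the mapping.
 */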
static void
xfs_zone_gc_finish_chunk(
        struct xfs_gc_bio       *chunk)
{
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        struct xfs_zone_gc_data *data = chunk->data;
        struct xfs_inode        *ip = chunk->ip;
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;

        if (chunk->bio.bi_status)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        if (xfs_is_shutdown(mp)) {
                xfs_zone_gc_free_chunk(chunk);
                return;
        }

        data->scratch_tail =
                (data->scratch_tail + chunk->len) % data->scratch_size;
        data->scratch_available += chunk->len;

        /*
         * Cycle through the iolock and wait for direct I/O and layouts to
         * ensure no one is reading from the old mapping before it goes away.
         *
         * Note that xfs_zoned_end_io() below checks that no other writer raced
         * with us to update the mapping by checking that the old startblock
         * didn't change.
         */
        xfs_ilock(ip, iolock);
        error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
        if (!error)
                inode_dio_wait(VFS_I(ip));
        xfs_iunlock(ip, iolock);
        if (error)
                goto free;

        if (chunk->is_seq)
                chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
        error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
                        chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
        if (error)
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
        xfs_zone_gc_free_chunk(chunk);
}

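/*
 * A zone reset (or discard) has completed.  Mark the zone as free again and
 * wake up anybody waiting for free zones.
 */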
static void
xfs_zone_gc_finish_reset(
        struct xfs_gc_bio       *chunk)
{
        struct xfs_rtgroup      *rtg = chunk->bio.bi_private;
        struct xfs_mount        *mp = rtg_mount(rtg);
        struct xfs_zone_info    *zi = mp->m_zone_info;

        if (chunk->bio.bi_status) {
                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
                goto out;
        }

        xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
        atomic_inc(&zi->zi_nr_free_zones);

        xfs_zoned_add_available(mp, rtg_blocks(rtg));

        wake_up_all(&zi->zi_zone_wait);
out:
        list_del(&chunk->entry);
        bio_put(&chunk->bio);
}

static void
xfs_submit_zone_reset_bio(
        struct xfs_rtgroup      *rtg,
        struct bio              *bio)
{
        struct xfs_mount        *mp = rtg_mount(rtg);

        trace_xfs_zone_reset(rtg);

        ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);

        if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
                bio_io_error(bio);
                return;
        }

        XFS_STATS_INC(mp, xs_gc_zone_reset_calls);

        bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
        if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
                /*
                 * Also use the bio to drive the state machine when neither
                 * zone reset nor discard is supported to keep things simple.
                 */
                if (!bdev_max_discard_sectors(bio->bi_bdev)) {
                        bio_endio(bio);
                        return;
                }
                bio->bi_opf &= ~REQ_OP_ZONE_RESET;
                bio->bi_opf |= REQ_OP_DISCARD;
                bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
        }

        submit_bio(bio);
}

static void xfs_bio_wait_endio(struct bio *bio)
{
        complete(bio->bi_private);
}

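/*
 * Synchronously reset a single zone and wait for the reset to complete.
 */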
int
xfs_zone_gc_reset_sync(
        struct xfs_rtgroup      *rtg)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct bio              bio;
        int                     error;

        bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
                        REQ_OP_ZONE_RESET | REQ_SYNC);
        bio.bi_private = &done;
        bio.bi_end_io = xfs_bio_wait_endio;
        xfs_submit_zone_reset_bio(rtg, &bio);
        wait_for_completion_io(&done);

        error = blk_status_to_errno(bio.bi_status);
        bio_uninit(&bio);
        return error;
}

static void
xfs_zone_gc_reset_zones(
        struct xfs_zone_gc_data *data,
        struct xfs_group        *reset_list)
{
        struct xfs_group        *next = reset_list;

        if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
                xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
                return;
        }

        do {
                struct xfs_rtgroup      *rtg = to_rtg(next);
                struct xfs_gc_bio       *chunk;
                struct bio              *bio;

                xfs_log_force_inode(rtg_rmap(rtg));

                next = rtg_group(rtg)->xg_next_reset;
                rtg_group(rtg)->xg_next_reset = NULL;

                bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
                                0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
                bio->bi_private = rtg;
                bio->bi_end_io = xfs_zone_gc_end_io;

                chunk = container_of(bio, struct xfs_gc_bio, bio);
                chunk->data = data;
                WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
                list_add_tail(&chunk->entry, &data->resetting);
                xfs_submit_zone_reset_bio(rtg, bio);
        } while (next);
}

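/*
 * Decide if new GC work should be started: the file system must not be shut
 * down, there must be scratch space and room in the GC target zone, and
 * either a victim zone is already being processed or there is enough space
 * pressure to pick a new one.
 */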
static bool
xfs_zone_gc_should_start_new_work(
        struct xfs_zone_gc_data *data)
{
        struct xfs_open_zone    *oz;

        if (xfs_is_shutdown(data->mp))
                return false;
        if (!data->scratch_available)
                return false;

        oz = xfs_zone_gc_ensure_target(data->mp);
        if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
                return false;

        if (!data->iter.victim_rtg) {
                if (kthread_should_stop() || kthread_should_park())
                        return false;
                if (!xfs_zoned_need_gc(data->mp))
                        return false;
                if (!xfs_zone_gc_select_victim(data))
                        return false;
        }

        return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
        struct xfs_zone_gc_data *data)
{
        struct xfs_zone_info    *zi = data->mp->m_zone_info;
        struct xfs_gc_bio       *chunk, *next;
        struct xfs_group        *reset_list;
        struct blk_plug         plug;

        spin_lock(&zi->zi_reset_list_lock);
        reset_list = zi->zi_reset_list;
        zi->zi_reset_list = NULL;
        spin_unlock(&zi->zi_reset_list_lock);

        if (reset_list) {
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_reset_zones(data, reset_list);
        }

        list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_reset(chunk);
        }

        list_for_each_entry_safe(chunk, next, &data->writing, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_finish_chunk(chunk);
        }

        blk_start_plug(&plug);
        list_for_each_entry_safe(chunk, next, &data->reading, entry) {
                if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
                        break;
                set_current_state(TASK_RUNNING);
                xfs_zone_gc_write_chunk(chunk);
        }
        blk_finish_plug(&plug);

        if (xfs_zone_gc_should_start_new_work(data)) {
                set_current_state(TASK_RUNNING);
                blk_start_plug(&plug);
                while (xfs_zone_gc_start_chunk(data))
                        ;
                blk_finish_plug(&plug);
        }
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that, reflinks
 * are currently not supported on zoned file systems: a file system with both
 * features enabled can neither be created nor mounted.
 */
static int
xfs_zoned_gcd(
        void                    *private)
{
        struct xfs_zone_gc_data *data = private;
        struct xfs_mount        *mp = data->mp;
        struct xfs_zone_info    *zi = mp->m_zone_info;
        unsigned int            nofs_flag;

        nofs_flag = memalloc_nofs_save();
        set_freezable();

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
                xfs_set_zonegc_running(mp);

                xfs_zone_gc_handle_work(data);

                /*
                 * Only sleep if nothing set the state to running.  Else check for
                 * work again as someone might have queued up more work and woken
                 * us in the meantime.
                 */
                if (get_current_state() == TASK_RUNNING) {
                        try_to_freeze();
                        continue;
                }

                if (list_empty(&data->reading) &&
                    list_empty(&data->writing) &&
                    list_empty(&data->resetting) &&
                    !zi->zi_reset_list) {
                        xfs_clear_zonegc_running(mp);
                        xfs_zoned_resv_wake_all(mp);

                        if (kthread_should_stop()) {
                                __set_current_state(TASK_RUNNING);
                                break;
                        }

                        if (kthread_should_park()) {
                                __set_current_state(TASK_RUNNING);
                                kthread_parkme();
                                continue;
                        }
                }

                schedule();
        }
        xfs_clear_zonegc_running(mp);

        if (data->iter.victim_rtg)
                xfs_rtgroup_rele(data->iter.victim_rtg);

        memalloc_nofs_restore(nofs_flag);
        xfs_zone_gc_data_free(data);
        return 0;
}

void
xfs_zone_gc_start(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
        struct xfs_mount        *mp)
{
        if (xfs_has_zoned(mp))
                kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;
        struct xfs_zone_gc_data *data;
        struct xfs_open_zone    *oz;
        int                     error;

        /*
         * If there are no free zones available for GC, pick the open zone with
         * the least used space to GC into.  This should only happen after an
         * unclean shutdown near ENOSPC while GC was ongoing.
         *
         * We also need to do this for the first gc zone allocation if we
         * unmounted while at the open limit.
         */
        if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
            zi->zi_nr_open_zones == mp->m_max_open_zones)
                oz = xfs_zone_gc_steal_open(zi);
        else
                oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
        if (!oz) {
                xfs_warn(mp, "unable to allocate a zone for gc");
                error = -EIO;
                goto out;
        }

        trace_xfs_zone_gc_target_opened(oz->oz_rtg);
        zi->zi_open_gc_zone = oz;

        data = xfs_zone_gc_data_alloc(mp);
        if (!data) {
                error = -ENOMEM;
                goto out_put_gc_zone;
        }

        zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
                        "xfs-zone-gc/%s", mp->m_super->s_id);
        if (IS_ERR(zi->zi_gc_thread)) {
                xfs_warn(mp, "unable to create zone gc thread");
                error = PTR_ERR(zi->zi_gc_thread);
                goto out_free_gc_data;
        }

        /* xfs_zone_gc_start will unpark for rw mounts */
        kthread_park(zi->zi_gc_thread);
        return 0;

out_free_gc_data:
        kfree(data);
out_put_gc_zone:
        xfs_open_zone_put(zi->zi_open_gc_zone);
out:
        return error;
}

void
xfs_zone_gc_unmount(
        struct xfs_mount        *mp)
{
        struct xfs_zone_info    *zi = mp->m_zone_info;

        kthread_stop(zi->zi_gc_thread);
        if (zi->zi_open_gc_zone)
                xfs_open_zone_put(zi->zi_open_gc_zone);
}