root/fs/xfs/xfs_exchrange.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_quota.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_trace.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rtbitmap.h"
#include <linux/fsnotify.h>

/*
 * Take XFS_ILOCK_EXCL on the inode(s) taking part in a file range exchange.
 *
 * @tp may be NULL; when it is not, every locked inode is also joined to the
 * transaction.  If @ip1 and @ip2 are the same inode it is locked and joined
 * exactly once.
 */
void
xfs_exchrange_ilock(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip1,
        struct xfs_inode        *ip2)
{
        /* Exchange within a single file: only one inode to deal with. */
        if (ip1 == ip2) {
                xfs_ilock(ip1, XFS_ILOCK_EXCL);
                if (tp)
                        xfs_trans_ijoin(tp, ip1, 0);
                return;
        }

        /* Two distinct inodes: let the helper take both locks. */
        xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, ip2, XFS_ILOCK_EXCL);
        if (!tp)
                return;

        xfs_trans_ijoin(tp, ip1, 0);
        xfs_trans_ijoin(tp, ip2, 0);
}

/*
 * Drop the XFS_ILOCK_EXCL that xfs_exchrange_ilock took.  When both arguments
 * name the same inode it is unlocked once; otherwise @ip2 is released before
 * @ip1.
 */
void
xfs_exchrange_iunlock(
        struct xfs_inode        *ip1,
        struct xfs_inode        *ip2)
{
        if (ip1 == ip2) {
                xfs_iunlock(ip1, XFS_ILOCK_EXCL);
                return;
        }

        xfs_iunlock(ip2, XFS_ILOCK_EXCL);
        xfs_iunlock(ip1, XFS_ILOCK_EXCL);
}

/*
 * Work out what resources an exchange of file contents between the two files
 * in @req will need.  The ILOCKs of both inodes are taken (without a
 * transaction) around the call to xfs_exchmaps_estimate and dropped again
 * before returning.
 *
 * The caller must already hold the IOLOCK and the MMAPLOCK and must have
 * flushed both inodes' pagecache and active direct-ios.
 *
 * Returns whatever xfs_exchmaps_estimate returns.
 */
int
xfs_exchrange_estimate(
        struct xfs_exchmaps_req *req)
{
        int                     ret;

        xfs_exchrange_ilock(NULL, req->ip1, req->ip2);

        ret = xfs_exchmaps_estimate(req);

        xfs_exchrange_iunlock(req->ip1, req->ip2);

        return ret;
}

/*
 * Check that file2's metadata agree with the snapshot that we took for the
 * range commit request.
 *
 * This should be called after the filesystem has locked /all/ inode metadata
 * against modification.
 */
STATIC int
xfs_exchrange_check_freshness(
        const struct xfs_exchrange      *fxr,
        struct xfs_inode                *ip2)
{
        struct inode                    *inode2 = VFS_I(ip2);
        struct timespec64               ctime = inode_get_ctime(inode2);
        struct timespec64               mtime = inode_get_mtime(inode2);

        trace_xfs_exchrange_freshness(fxr, ip2);

        /* Check that file2 hasn't otherwise been modified. */
        if (fxr->file2_ino != ip2->i_ino ||
            fxr->file2_gen != inode2->i_generation ||
            !timespec64_equal(&fxr->file2_ctime, &ctime) ||
            !timespec64_equal(&fxr->file2_mtime, &mtime))
                return -EBUSY;

        return 0;
}

#define QRETRY_IP1      (0x1)
#define QRETRY_IP2      (0x2)

/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.
 *
 * Nothing is reserved (and *@qretry is left untouched) when quotas are not
 * on, when both sides of the request are the same inode, or when the two
 * inodes share all three dquot pointers.
 *
 * Otherwise *@qretry is reset to zero and then gains QRETRY_IP1 and/or
 * QRETRY_IP2 for each inode whose reservation failed with -EDQUOT or
 * -ENOSPC, so that the caller knows which file(s) to clean up before
 * retrying.
 *
 * Returns 0 on success or a negative errno.
 */
STATIC int
xfs_exchrange_reserve_quota(
        struct xfs_trans                *tp,
        const struct xfs_exchmaps_req   *req,
        unsigned int                    *qretry)
{
        int64_t                         dgain, rgain;
        int                             deferred = 0;
        int                             ret;

        ASSERT(!xfs_is_metadir_inode(req->ip1));
        ASSERT(!xfs_is_metadir_inode(req->ip2));

        /* No quotas, or a single file: there is nothing to reserve. */
        if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2)
                return 0;

        /* Both inodes point at the same dquots: skip the reservation. */
        if (req->ip1->i_udquot == req->ip2->i_udquot &&
            req->ip1->i_gdquot == req->ip2->i_gdquot &&
            req->ip1->i_pdquot == req->ip2->i_pdquot)
                return 0;

        *qretry = 0;

        /*
         * Net change in data-device and realtime block counts as seen from
         * ip1: positive means ip1 gains blocks, negative means ip2 does.
         * Whichever file gains must be able to absorb that much quota.
         */
        dgain = req->ip2_bcount - req->ip1_bcount;
        rgain = req->ip2_rtbcount - req->ip1_rtbcount;

        if (dgain > 0 || rgain > 0) {
                ret = xfs_trans_reserve_quota_nblks(tp, req->ip1,
                                dgain > 0 ? dgain : 0,
                                rgain > 0 ? rgain : 0,
                                false);
                switch (ret) {
                case 0:
                        break;
                case -EDQUOT:
                case -ENOSPC:
                        /*
                         * Remember the failure but still try ip2 so that
                         * both files can be flagged for the caller.
                         */
                        *qretry |= QRETRY_IP1;
                        deferred = ret;
                        break;
                default:
                        return ret;
                }
        }

        if (dgain < 0 || rgain < 0) {
                ret = xfs_trans_reserve_quota_nblks(tp, req->ip2,
                                dgain < 0 ? -dgain : 0,
                                rgain < 0 ? -rgain : 0,
                                false);
                if (ret == -EDQUOT || ret == -ENOSPC)
                        *qretry |= QRETRY_IP2;
                if (ret)
                        return ret;
        }

        /* ip2 was fine (or not needed); now report ip1's saved failure. */
        if (deferred)
                return deferred;

        /*
         * For each file, forcibly reserve the gross gain in mapped blocks so
         * that we don't trip over any quota block reservation assertions.
         * We must reserve the gross gain because the quota code subtracts from
         * bcount the number of blocks that we unmap; it does not add that
         * quantity back to the quota block reservation.
         */
        ret = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
                        req->ip1_rtbcount, true);
        if (ret)
                return ret;

        return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
                        req->ip2_rtbcount, true);
}

/* Exchange the mappings (and hence the contents) of two files' forks. */
STATIC int
xfs_exchrange_mappings(
        const struct xfs_exchrange      *fxr,
        struct xfs_inode                *ip1,
        struct xfs_inode                *ip2)
{
        struct xfs_mount                *mp = ip1->i_mount;
        struct xfs_exchmaps_req         req = {
                .ip1                    = ip1,
                .ip2                    = ip2,
                .startoff1              = XFS_B_TO_FSBT(mp, fxr->file1_offset),
                .startoff2              = XFS_B_TO_FSBT(mp, fxr->file2_offset),
                .blockcount             = XFS_B_TO_FSB(mp, fxr->length),
        };
        struct xfs_trans                *tp;
        unsigned int                    qretry;
        bool                            retried = false;
        int                             error;

        trace_xfs_exchrange_mappings(fxr, ip1, ip2);

        if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
                req.flags |= XFS_EXCHMAPS_SET_SIZES;
        if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
                req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;

        /*
         * Round the request length up to the nearest file allocation unit.
         * The prep function already checked that the request offsets and
         * length in @fxr are safe to round up.
         */
        if (xfs_inode_has_bigrtalloc(ip2))
                req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);

        error = xfs_exchrange_estimate(&req);
        if (error)
                return error;

retry:
        /* Allocate the transaction, lock the inodes, and join them. */
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
                        XFS_TRANS_RES_FDBLKS, &tp);
        if (error)
                return error;

        xfs_exchrange_ilock(tp, ip1, ip2);

        trace_xfs_exchrange_before(ip2, 2);
        trace_xfs_exchrange_before(ip1, 1);

        error = xfs_exchmaps_check_forks(mp, &req);
        if (error)
                goto out_trans_cancel;

        /*
         * Reserve ourselves some quota if any of them are in enforcing mode.
         * In theory we only need enough to satisfy the change in the number
         * of blocks between the two ranges being remapped.
         */
        error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
        if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
                xfs_trans_cancel(tp);
                xfs_exchrange_iunlock(ip1, ip2);
                if (qretry & QRETRY_IP1)
                        xfs_blockgc_free_quota(ip1, 0);
                if (qretry & QRETRY_IP2)
                        xfs_blockgc_free_quota(ip2, 0);
                retried = true;
                goto retry;
        }
        if (error)
                goto out_trans_cancel;

        /* If we got this far on a dry run, all parameters are ok. */
        if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
                goto out_trans_cancel;

        /* Update the mtime and ctime of both files. */
        if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
                xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
                xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

        xfs_exchange_mappings(tp, &req);

        /*
         * Force the log to persist metadata updates if the caller or the
         * administrator requires this.  The generic prep function already
         * flushed the relevant parts of the page cache.
         */
        if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
                xfs_trans_set_sync(tp);

        error = xfs_trans_commit(tp);

        trace_xfs_exchrange_after(ip2, 2);
        trace_xfs_exchrange_after(ip1, 1);

        if (error)
                goto out_unlock;

        /*
         * If the caller wanted us to exchange the contents of two complete
         * files of unequal length, exchange the incore sizes now.  This should
         * be safe because we flushed both files' page caches, exchanged all
         * the mappings, and updated the ondisk sizes.
         */
        if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
                loff_t  temp;

                temp = i_size_read(VFS_I(ip2));
                i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
                i_size_write(VFS_I(ip1), temp);
        }

out_unlock:
        xfs_exchrange_iunlock(ip1, ip2);
        return error;

out_trans_cancel:
        xfs_trans_cancel(tp);
        goto out_unlock;
}

/*
 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 * This part deals with struct file objects and byte ranges and does not deal
 * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 * separation may some day facilitate porting to another filesystem.
 *
 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 * Implementations must call xfs_exchange_range_prep to prepare the two
 * files after locking them against IO but before taking the inode metadata
 * locks; and they must update the inode change and mod
 * times of both files as part of the metadata update.  The timestamp update
 * and freshness checks must be done atomically as part of the data exchange
 * operation to ensure correctness of the freshness check.
 * xfs_exchange_range_finish must be called after the operation completes
 * successfully but before locks are dropped.
 */

/*
 * Performs necessary checks before doing a range exchange, having stabilized
 * mutable inode attributes via i_rwsem.
 *
 * @fxr:        the exchange request; for XFS_EXCHANGE_RANGE_TO_EOF requests
 *              fxr->length is overwritten here with the distance to the
 *              farther of the two EOFs.
 * @alloc_unit: file allocation unit in bytes.  The bitmask arithmetic below
 *              (alloc_unit - 1 used as a mask) assumes this is a power of two.
 *
 * Returns 0 if the request is acceptable, -EPERM for immutable inodes,
 * -ETXTBSY for swapfiles, -EINVAL for bad ranges or alignment, or whatever
 * generic_write_check_limits returns.
 */
static inline int
xfs_exchange_range_checks(
        struct xfs_exchrange    *fxr,
        unsigned int            alloc_unit)
{
        struct inode            *inode1 = file_inode(fxr->file1);
        loff_t                  size1 = i_size_read(inode1);
        struct inode            *inode2 = file_inode(fxr->file2);
        loff_t                  size2 = i_size_read(inode2);
        /* 64-bit so that ~allocmask keeps the upper bits of a 64-bit length */
        uint64_t                allocmask = alloc_unit - 1;
        int64_t                 test_len;
        uint64_t                blen;
        loff_t                  tmp;
        int                     error;

        /* Don't touch certain kinds of inodes */
        if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
                return -EPERM;
        if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
                return -ETXTBSY;

        /* Ranges cannot start after EOF. */
        if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
                return -EINVAL;

        if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
                /*
                 * If the caller said to exchange to EOF, we set the length of
                 * the request large enough to cover everything to the end of
                 * both files.  Neither difference can be negative because
                 * both offsets were just checked against the file sizes.
                 */
                fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
                                             size2 - fxr->file2_offset);
        } else {
                /*
                 * Otherwise we require both ranges to end within EOF.
                 */
                if (fxr->file1_offset + fxr->length > size1 ||
                    fxr->file2_offset + fxr->length > size2)
                        return -EINVAL;
        }

        /*
         * The start of both ranges must be aligned to the file allocation
         * unit.
         */
        if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
            !IS_ALIGNED(fxr->file2_offset, alloc_unit))
                return -EINVAL;

        /* Ensure offsets don't wrap; tmp only receives the discarded sum. */
        if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
            check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
                return -EINVAL;

        /*
         * Make sure we don't hit any file size limits.  If we hit any size
         * limits such that test_len was adjusted, we abort the whole
         * operation.
         */
        test_len = fxr->length;
        error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
                        &test_len);
        if (error)
                return error;
        error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
                        &test_len);
        if (error)
                return error;
        if (test_len != fxr->length)
                return -EINVAL;

        /*
         * If the user wanted us to exchange up to the infile's EOF, round up
         * to the next allocation unit boundary for this check.  Do the same
         * for the outfile.
         *
         * Otherwise, reject the range length if it's not aligned to an
         * allocation unit.
         */
        if (fxr->file1_offset + fxr->length == size1)
                blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
        else if (fxr->file2_offset + fxr->length == size2)
                blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
        else if (!IS_ALIGNED(fxr->length, alloc_unit))
                return -EINVAL;
        else
                blen = fxr->length;

        /*
         * Don't allow overlapped exchanges within the same file.  blen is the
         * allocation-unit-rounded length computed just above.
         */
        if (inode1 == inode2 &&
            fxr->file2_offset + blen > fxr->file1_offset &&
            fxr->file1_offset + blen > fxr->file2_offset)
                return -EINVAL;

        /*
         * Ensure that we don't exchange a partial EOF block into the middle of
         * another file.  An aligned length cannot contain a partial block.
         */
        if ((fxr->length & allocmask) == 0)
                return 0;

        /*
         * The length is unaligned.  For each file whose range stops short of
         * EOF, round blen down; if that changes blen, the partial block would
         * have landed in the middle of that file, so reject the request.
         */
        blen = fxr->length;
        if (fxr->file2_offset + blen < size2)
                blen &= ~allocmask;

        if (fxr->file1_offset + blen < size1)
                blen &= ~allocmask;

        return blen == fxr->length ? 0 : -EINVAL;
}

/*
 * Validate a range exchange request against both files and then write back
 * all dirty pagecache in the two ranges.  Caller must ensure that the inodes
 * have been locked against any other modifications.
 *
 * On return fxr->flags has XFS_EXCHANGE_RANGE_DSYNC set if either file was
 * opened with O_SYNC or either inode is marked synchronous.
 *
 * Returns 0 on success (including the case where the validated length is
 * zero and there is nothing to flush) or a negative errno.
 */
static inline int
xfs_exchange_range_prep(
        struct xfs_exchrange    *fxr,
        unsigned int            alloc_unit)
{
        struct inode            *vfs1 = file_inode(fxr->file1);
        struct inode            *vfs2 = file_inode(fxr->file2);
        int                     ret;

        /* Check that we don't violate system file offset limits. */
        ret = xfs_exchange_range_checks(fxr, alloc_unit);
        if (ret)
                return ret;
        if (fxr->length == 0)
                return 0;

        /* Let any in-flight direct IO on either file drain first. */
        inode_dio_wait(vfs1);
        if (vfs1 != vfs2)
                inode_dio_wait(vfs2);

        /* Write back and wait on file1's range, then file2's. */
        ret = filemap_write_and_wait_range(vfs1->i_mapping,
                        fxr->file1_offset,
                        fxr->file1_offset + fxr->length - 1);
        if (ret)
                return ret;

        ret = filemap_write_and_wait_range(vfs2->i_mapping,
                        fxr->file2_offset,
                        fxr->file2_offset + fxr->length - 1);
        if (ret)
                return ret;

        /*
         * Synchronous semantics requested through either file descriptor or
         * either inode apply to the whole operation, so ask the filesystem to
         * persist everything once the exchange completes.
         */
        if ((fxr->file1->f_flags & O_SYNC) ||
            (fxr->file2->f_flags & O_SYNC) ||
            IS_SYNC(vfs1) ||
            IS_SYNC(vfs2))
                fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;

        return 0;
}

/*
 * Wrap up a successful range exchange by calling file_remove_privs on file1
 * and, unless both files refer to the same inode, on file2 as well.  Caller
 * must ensure that the inodes are still locked against any other
 * modifications.
 *
 * Returns 0 or the first error reported by file_remove_privs.
 */
static inline int
xfs_exchange_range_finish(
        struct xfs_exchrange    *fxr)
{
        int                     ret = file_remove_privs(fxr->file1);

        /* Stop on failure, or when file2 is the inode we just handled. */
        if (ret || file_inode(fxr->file1) == file_inode(fxr->file2))
                return ret;

        return file_remove_privs(fxr->file2);
}

/*
 * Check the alignment of an exchange request when the allocation unit size
 * isn't a power of two.  The generic file-level helpers use (fast)
 * bitmask-based alignment checks, but here we have to use slow long division.
 *
 * @fxr:        the exchange request (not modified here)
 * @ip1, @ip2:  the two inodes whose sizes bound the ranges
 * @alloc_unit: allocation unit in bytes
 *
 * Returns 0 if the request is acceptably aligned, or -EINVAL.
 */
static int
xfs_exchrange_check_rtalign(
        const struct xfs_exchrange      *fxr,
        struct xfs_inode                *ip1,
        struct xfs_inode                *ip2,
        unsigned int                    alloc_unit)
{
        uint64_t                        length = fxr->length;
        uint64_t                        blen;
        loff_t                          size1, size2;

        size1 = i_size_read(VFS_I(ip1));
        size2 = i_size_read(VFS_I(ip2));

        /* The start of both ranges must be aligned to a rt extent. */
        if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
            !isaligned_64(fxr->file2_offset, alloc_unit))
                return -EINVAL;

        /*
         * For exchange-to-EOF requests, work out the effective length locally
         * as the distance to the farther EOF; @fxr is const and is left
         * untouched.
         */
        if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
                length = max_t(int64_t, size1 - fxr->file1_offset,
                                        size2 - fxr->file2_offset);

        /*
         * If the user wanted us to exchange up to the infile's EOF, round up
         * to the next rt extent boundary for this check.  Do the same for the
         * outfile.
         *
         * Otherwise, reject the range length if it's not rt extent aligned.
         * We already confirmed the starting offsets' rt extent block
         * alignment.
         */
        if (fxr->file1_offset + length == size1)
                blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
        else if (fxr->file2_offset + length == size2)
                blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
        else if (!isaligned_64(length, alloc_unit))
                return -EINVAL;
        else
                blen = length;

        /*
         * Don't allow overlapped exchanges within the same file.  blen is the
         * rounded-up length computed just above.
         */
        if (ip1 == ip2 &&
            fxr->file2_offset + blen > fxr->file1_offset &&
            fxr->file1_offset + blen > fxr->file2_offset)
                return -EINVAL;

        /*
         * Ensure that we don't exchange a partial EOF rt extent into the
         * middle of another file.  An aligned length has no partial extent.
         */
        if (isaligned_64(length, alloc_unit))
                return 0;

        /*
         * The length is unaligned.  Round blen down for each file whose range
         * stops short of EOF; any change to blen means the partial extent
         * would land mid-file, which is rejected below.
         */
        blen = length;
        if (fxr->file2_offset + length < size2)
                blen = rounddown_64(blen, alloc_unit);

        if (fxr->file1_offset + blen < size1)
                blen = rounddown_64(blen, alloc_unit);

        return blen == length ? 0 : -EINVAL;
}

/*
 * Prepare two files to have their data exchanged.
 *
 * In order: check that both files are of the same (realtime or not) kind,
 * run the alignment and generic range checks, optionally verify file2's
 * freshness, attach dquots, flush and unmap the affected ranges, and cancel
 * any CoW fork reservations covering them.
 *
 * Returns 0 on success (also when the validated length is zero, in which case
 * the later steps are skipped) or a negative errno.
 */
STATIC int
xfs_exchrange_prep(
        struct xfs_exchrange    *fxr,
        struct xfs_inode        *ip1,
        struct xfs_inode        *ip2)
{
        struct xfs_mount        *mp = ip2->i_mount;
        unsigned int            unit = xfs_inode_alloc_unitsize(ip2);
        int                     ret;

        trace_xfs_exchrange_prep(fxr, ip1, ip2);

        /* A realtime file cannot be exchanged with a non-realtime one. */
        if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
                return -EINVAL;

        /*
         * The generic checks only cope with power-of-two units.  For any
         * other unit size, do the alignment checks the slow way here and
         * hand the generic code the plain block size instead.
         */
        if (!is_power_of_2(unit)) {
                ret = xfs_exchrange_check_rtalign(fxr, ip1, ip2, unit);
                if (ret)
                        return ret;

                unit = mp->m_sb.sb_blocksize;
        }

        ret = xfs_exchange_range_prep(fxr, unit);
        if (ret)
                return ret;
        if (fxr->length == 0)
                return 0;

        /* Range commit requests insist that file2 has not changed. */
        if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
                ret = xfs_exchrange_check_freshness(fxr, ip2);
                if (ret)
                        return ret;
        }

        /* Attach dquots to both inodes before changing block maps. */
        ret = xfs_qm_dqattach(ip2);
        if (!ret)
                ret = xfs_qm_dqattach(ip1);
        if (ret)
                return ret;

        trace_xfs_exchrange_flush(fxr, ip1, ip2);

        /* Flush the relevant ranges of both files, file2 first. */
        ret = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
        if (!ret)
                ret = xfs_flush_unmap_range(ip1, fxr->file1_offset,
                                fxr->length);
        if (ret)
                return ret;

        /*
         * Cancel CoW fork preallocations for the ranges of both files.  The
         * prep function should have flushed all the dirty data, so the only
         * CoW mappings remaining should be speculative.
         */
        if (xfs_inode_has_cow_data(ip1)) {
                ret = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
                                fxr->length, true);
                if (ret)
                        return ret;
        }

        if (!xfs_inode_has_cow_data(ip2))
                return 0;

        return xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
                        fxr->length, true);
}

/*
 * Exchange contents of files.  This is the binding between the generic
 * file-level concepts and the XFS inode-specific implementation: it checks
 * feature support and flags, locks both files against IO, and then runs the
 * prep, mapping exchange and finish steps in that order, stopping at the
 * first failure.
 *
 * Returns 0 on success or a negative errno.  Any failure after the initial
 * feature, flag and shutdown checks is reported through the
 * xfs_exchrange_error tracepoint.
 */
STATIC int
xfs_exchrange_contents(
        struct xfs_exchrange    *fxr)
{
        struct inode            *inode1 = file_inode(fxr->file1);
        struct inode            *inode2 = file_inode(fxr->file2);
        struct xfs_inode        *ip1 = XFS_I(inode1);
        struct xfs_inode        *ip2 = XFS_I(inode2);
        struct xfs_mount        *mp = ip1->i_mount;
        int                     error;

        if (!xfs_has_exchange_range(mp))
                return -EOPNOTSUPP;

        if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
                           XFS_EXCHANGE_RANGE_PRIV_FLAGS))
                return -EINVAL;

        if (xfs_is_shutdown(mp))
                return -EIO;

        /* Lock both files against IO */
        error = xfs_ilock2_io_mmap(ip1, ip2);
        if (error)
                goto out_trace;

        /* Prepare and then exchange file contents. */
        error = xfs_exchrange_prep(fxr, ip1, ip2);
        if (!error)
                error = xfs_exchrange_mappings(fxr, ip1, ip2);

        /*
         * Finish the exchange by removing special file privileges like any
         * other file write would do.  This may involve turning on support for
         * logged xattrs if either file has security capabilities.
         */
        if (!error)
                error = xfs_exchange_range_finish(fxr);

        xfs_iunlock2_io_mmap(ip1, ip2);
out_trace:
        if (error)
                trace_xfs_exchrange_error(ip2, error, _RET_IP_);
        return error;
}

/* Exchange parts of two files. */
static int
xfs_exchange_range(
        struct xfs_exchrange    *fxr)
{
        struct inode            *inode1 = file_inode(fxr->file1);
        struct inode            *inode2 = file_inode(fxr->file2);
        loff_t                  check_len = fxr->length;
        int                     ret;

        BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
                     XFS_EXCHANGE_RANGE_PRIV_FLAGS);

        /* Both files must be on the same mount/filesystem. */
        if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
                return -EXDEV;

        if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
                         __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
                return -EINVAL;

        /* Userspace requests only honored for regular files. */
        if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
                return -EINVAL;

        /* Both files must be opened for read and write. */
        if (!(fxr->file1->f_mode & FMODE_READ) ||
            !(fxr->file1->f_mode & FMODE_WRITE) ||
            !(fxr->file2->f_mode & FMODE_READ) ||
            !(fxr->file2->f_mode & FMODE_WRITE))
                return -EBADF;

        /* Neither file can be opened append-only. */
        if ((fxr->file1->f_flags & O_APPEND) ||
            (fxr->file2->f_flags & O_APPEND))
                return -EBADF;

        /*
         * If we're exchanging to EOF we can't calculate the length until taking
         * the iolock.  Pass a 0 length to remap_verify_area similar to the
         * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
         */
        if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
                check_len = 0;
        ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
        if (ret)
                return ret;
        ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
        if (ret)
                return ret;

        /* Update cmtime if the fd/inode don't forbid it. */
        if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
                fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
        if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
                fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;

        file_start_write(fxr->file2);
        ret = xfs_exchrange_contents(fxr);
        file_end_write(fxr->file2);
        if (ret)
                return ret;

        fsnotify_modify(fxr->file1);
        if (fxr->file2 != fxr->file1)
                fsnotify_modify(fxr->file2);
        return 0;
}

/*
 * Collect exchange-range arguments from userspace.
 *
 * @file is used as file2 of the exchange; file1 is looked up from the file
 * descriptor in the user-supplied arguments.
 *
 * Returns -EFAULT if the arguments cannot be copied in, -EINVAL if the
 * padding is nonzero or unknown flags are set, -EBADF if the file1
 * descriptor is not valid, or otherwise the result of xfs_exchange_range.
 */
long
xfs_ioc_exchange_range(
        struct file                     *file,
        struct xfs_exchange_range __user *argp)
{
        struct xfs_exchange_range       uargs;
        struct xfs_exchrange            fxr = {
                .file2                  = file,
        };

        if (copy_from_user(&uargs, argp, sizeof(uargs)))
                return -EFAULT;

        /* Padding must be zero and only known flags may be set. */
        if (memchr_inv(&uargs.pad, 0, sizeof(uargs.pad)) ||
            (uargs.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS))
                return -EINVAL;

        fxr.flags               = uargs.flags;
        fxr.length              = uargs.length;
        fxr.file2_offset        = uargs.file2_offset;
        fxr.file1_offset        = uargs.file1_offset;

        /*
         * NOTE(review): no explicit release of file1 appears below;
         * presumably the CLASS(fd) declaration drops the reference when
         * file1 goes out of scope -- confirm.
         */
        CLASS(fd, file1)(uargs.file1_fd);
        if (fd_empty(file1))
                return -EBADF;
        fxr.file1 = fd_file(file1);

        return xfs_exchange_range(&fxr);
}

/*
 * Opaque freshness blob for XFS_IOC_COMMIT_RANGE.  It is filled in by
 * xfs_ioc_start_commit, handed to userspace inside the file2_freshness field
 * of struct xfs_commit_range, and decoded again by xfs_ioc_commit_range.
 */
struct xfs_commit_range_fresh {
        xfs_fsid_t      fsid;           /* m_fixedfsid */
        __u64           file2_ino;      /* inode number */
        __s64           file2_mtime;    /* modification time */
        __s64           file2_ctime;    /* change time */
        __s32           file2_mtime_nsec; /* mod time, nsec */
        __s32           file2_ctime_nsec; /* change time, nsec */
        __u32           file2_gen;      /* inode generation */
        __u32           magic;          /* XCR_FRESH_MAGIC */
};
/* Value stored in, and required of, the magic field above. */
#define XCR_FRESH_MAGIC 0x444F524B      /* DORK */

/*
 * Set up a commitrange operation by sampling file2's write-related attrs.
 *
 * The filesystem id, inode number, generation, ctime and mtime of @file are
 * packed into a struct xfs_commit_range_fresh and copied out to the
 * file2_freshness field of the user's struct xfs_commit_range.  The inode
 * fields and timestamps are sampled with the IOLOCK, MMAPLOCK and ILOCK held
 * shared.
 *
 * Returns 0 on success or -EFAULT if the blob cannot be copied out.
 */
long
xfs_ioc_start_commit(
        struct file                     *file,
        struct xfs_commit_range __user  *argp)
{
        struct xfs_commit_range         args = { };
        struct kstat                    kstat = { };
        struct xfs_commit_range_fresh   *fresh;
        struct xfs_commit_range_fresh   __user *ufresh;
        struct inode                    *vfs2 = file_inode(file);
        struct xfs_inode                *ip2 = XFS_I(vfs2);
        const unsigned int              lockflags = XFS_IOLOCK_SHARED |
                                                    XFS_MMAPLOCK_SHARED |
                                                    XFS_ILOCK_SHARED;

        /* The blob must fill the opaque freshness field exactly. */
        BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
                     sizeof(args.file2_freshness));

        /* Build the blob in place inside the zeroed args structure. */
        fresh = (struct xfs_commit_range_fresh *)&args.file2_freshness;
        memcpy(&fresh->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));

        xfs_ilock(ip2, lockflags);

        /* Force writing of a distinct ctime if any writes happen. */
        fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, vfs2);

        fresh->file2_ino                = ip2->i_ino;
        fresh->file2_gen                = vfs2->i_generation;
        fresh->file2_mtime              = kstat.mtime.tv_sec;
        fresh->file2_mtime_nsec         = kstat.mtime.tv_nsec;
        fresh->file2_ctime              = kstat.ctime.tv_sec;
        fresh->file2_ctime_nsec         = kstat.ctime.tv_nsec;
        fresh->magic                    = XCR_FRESH_MAGIC;

        xfs_iunlock(ip2, lockflags);

        /* Only the freshness field of the user's structure is written. */
        ufresh = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
        return copy_to_user(ufresh, fresh, sizeof(*fresh)) ? -EFAULT : 0;
}

/*
 * Exchange file1 and file2 contents if file2 has not been written since the
 * start commit operation.
 *
 * @file is file2.  The freshness blob embedded in the user's arguments is
 * unpacked into the exchange request and __XFS_EXCHANGE_RANGE_CHECK_FRESH2 is
 * set so that the sampled values are compared against file2 later on.
 *
 * Returns -EFAULT if the arguments cannot be copied in, -EINVAL for unknown
 * flags, -EBUSY if the blob's magic or filesystem id do not match, -EBADF if
 * the file1 descriptor is not valid, or otherwise the result of
 * xfs_exchange_range.
 */
long
xfs_ioc_commit_range(
        struct file                     *file,
        struct xfs_commit_range __user  *argp)
{
        struct xfs_exchrange            fxr = {
                .file2                  = file,
        };
        struct xfs_commit_range         args;
        struct xfs_commit_range_fresh   *fresh =
                (struct xfs_commit_range_fresh *)&args.file2_freshness;
        struct xfs_inode                *ip2 = XFS_I(file_inode(file));
        struct xfs_mount                *mp = ip2->i_mount;

        if (copy_from_user(&args, argp, sizeof(args)))
                return -EFAULT;
        if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
                return -EINVAL;

        /* The blob must carry our magic and belong to this filesystem. */
        if (fresh->magic != XCR_FRESH_MAGIC ||
            memcmp(&fresh->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
                return -EBUSY;

        /* Range and flags, with the freshness check forced on. */
        fxr.flags               = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
        fxr.length              = args.length;
        fxr.file1_offset        = args.file1_offset;
        fxr.file2_offset        = args.file2_offset;

        /* The snapshot of file2 that the exchange must still match. */
        fxr.file2_ino           = fresh->file2_ino;
        fxr.file2_gen           = fresh->file2_gen;
        fxr.file2_ctime.tv_sec  = fresh->file2_ctime;
        fxr.file2_ctime.tv_nsec = fresh->file2_ctime_nsec;
        fxr.file2_mtime.tv_sec  = fresh->file2_mtime;
        fxr.file2_mtime.tv_nsec = fresh->file2_mtime_nsec;

        /*
         * NOTE(review): no explicit release of file1 appears below;
         * presumably the CLASS(fd) declaration drops the reference when
         * file1 goes out of scope -- confirm.
         */
        CLASS(fd, file1)(args.file1_fd);
        if (fd_empty(file1))
                return -EBADF;
        fxr.file1 = fd_file(file1);

        return xfs_exchange_range(&fxr);
}