root/fs/btrfs/direct-io.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/fsverity.h>
#include <linux/iomap.h>
#include "ctree.h"
#include "delalloc-space.h"
#include "direct-io.h"
#include "extent-tree.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"
#include "volumes.h"
#include "bio.h"
#include "ordered-data.h"

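/*
 * State shared across the iomap callbacks of a single direct IO request. It
 * travels in the iomap iterator's private pointer from btrfs_dio_iomap_begin()
 * through btrfs_dio_submit_io() to btrfs_dio_iomap_end().
 */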
struct btrfs_dio_data {
        ssize_t submitted;
        struct extent_changeset *data_reserved;
        struct btrfs_ordered_extent *ordered;
        bool data_space_reserved;
        bool nocow_done;
};

struct btrfs_dio_private {
        /* Range of I/O */
        u64 file_offset;
        u32 bytes;

        /* This must be last */
        struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

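/*
 * Lock the file range for direct IO, making sure it has neither ordered
 * extents nor buffered pages left in it.
 *
 * The EXTENT_DIO_LOCKED bit is taken first and then the regular extent lock.
 * If an ordered extent or buffered pages are found in the range, the extent
 * lock is dropped and we either wait for them and retry, or bail out with
 * -EAGAIN (NOWAIT) or -ENOTBLK (to make the caller fall back to buffered IO).
 */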
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                              struct extent_state **cached_state,
                              unsigned int iomap_flags)
{
        const bool writing = (iomap_flags & IOMAP_WRITE);
        const bool nowait = (iomap_flags & IOMAP_NOWAIT);
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
        int ret = 0;

        /* Direct lock must be taken before the extent lock. */
        if (nowait) {
                if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
                        return -EAGAIN;
        } else {
                btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
        }

        while (1) {
                if (nowait) {
                        if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
                                                   cached_state)) {
                                ret = -EAGAIN;
                                break;
                        }
                } else {
                        btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
                }
                /*
                 * We're concerned with the entire range that we're going to be
                 * doing DIO to, so we need to make sure there are no ordered
                 * extents in this range.
                 */
                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
                                                     lockend - lockstart + 1);

                /*
                 * We need to make sure there are no buffered pages in this
                 * range either, we could have raced between the invalidate in
                 * generic_file_direct_write and locking the extent.  The
                 * invalidate needs to happen so that reads after a write do not
                 * get stale data.
                 */
                if (!ordered &&
                    (!writing || !filemap_range_has_page(inode->i_mapping,
                                                         lockstart, lockend)))
                        break;

                btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

                if (ordered) {
                        if (nowait) {
                                btrfs_put_ordered_extent(ordered);
                                ret = -EAGAIN;
                                break;
                        }
                        /*
                         * If we are doing a DIO read and the ordered extent we
                         * found is for a buffered write, we can not wait for it
                         * to complete and retry, because if we do so we can
                         * deadlock with concurrent buffered writes on page
                         * locks. This happens only if our DIO read covers more
                         * than one extent map, if at this point we have already
                         * created an ordered extent for a previous extent map
                         * and locked its range in the inode's io tree, and a
                         * concurrent write against that previous extent map's
                         * range and this range started (we unlock the ranges
                         * in the io tree only when the bios complete and
                         * buffered writes always lock pages before attempting
                         * to lock the range in the io tree).
                         */
                        if (writing ||
                            test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
                                btrfs_start_ordered_extent(ordered);
                        else
                                ret = nowait ? -EAGAIN : -ENOTBLK;
                        btrfs_put_ordered_extent(ordered);
                } else {
                        /*
                         * We could trigger writeback for this range (and wait
                         * for it to complete) and then invalidate the pages for
                         * this range (through invalidate_inode_pages2_range()),
                         * but that can lead us to a deadlock with a concurrent
                         * call to readahead (a buffered read or a defrag call
                         * triggered a readahead) on a page lock due to an
                         * ordered dio extent we created before but did not have
                         * yet a corresponding bio submitted (hence it can not
                         * complete), which makes readahead wait for that
                         * ordered extent to complete while holding a lock on
                         * that page.
                         */
                        ret = nowait ? -EAGAIN : -ENOTBLK;
                }

                if (ret)
                        break;

                cond_resched();
        }

        if (ret)
                btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
        return ret;
}

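/*
 * Create the extent map (except for NOCOW writes, which write into a
 * pre-existing extent) and the ordered extent for a direct IO write, and
 * stash the ordered extent in dio_data so that the bio submission path can
 * split and complete it later.
 */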
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
                                                  struct btrfs_dio_data *dio_data,
                                                  const u64 start,
                                                  const struct btrfs_file_extent *file_extent,
                                                  const int type)
{
        struct extent_map *em = NULL;
        struct btrfs_ordered_extent *ordered;

        if (type != BTRFS_ORDERED_NOCOW) {
                em = btrfs_create_io_em(inode, start, file_extent, type);
                if (IS_ERR(em))
                        goto out;
        }

        ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
                                             (1U << type) |
                                             (1U << BTRFS_ORDERED_DIRECT));
        if (IS_ERR(ordered)) {
                if (em) {
                        btrfs_free_extent_map(em);
                        btrfs_drop_extent_map_range(inode, start,
                                        start + file_extent->num_bytes - 1, false);
                }
                em = ERR_CAST(ordered);
        } else {
                ASSERT(!dio_data->ordered);
                dio_data->ordered = ordered;
        }
out:
        return em;
}

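/*
 * Allocate a new data extent for a COW direct IO write and create the
 * matching extent map and ordered extent. On zoned filesystems the
 * reservation may fail with -EAGAIN while a zone finish is needed, in which
 * case we wait for it and retry.
 */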
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
                                                  struct btrfs_dio_data *dio_data,
                                                  u64 start, u64 len)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_file_extent file_extent;
        struct extent_map *em;
        struct btrfs_key ins;
        u64 alloc_hint;
        int ret;

        alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
        ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
                                   0, alloc_hint, &ins, true, true);
        if (ret == -EAGAIN) {
                ASSERT(btrfs_is_zoned(fs_info));
                wait_on_bit_io(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
                               TASK_UNINTERRUPTIBLE);
                goto again;
        }
        if (ret)
                return ERR_PTR(ret);

        file_extent.disk_bytenr = ins.objectid;
        file_extent.disk_num_bytes = ins.offset;
        file_extent.num_bytes = ins.offset;
        file_extent.ram_bytes = ins.offset;
        file_extent.offset = 0;
        file_extent.compression = BTRFS_COMPRESS_NONE;
        em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
                                     BTRFS_ORDERED_REGULAR);
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        if (IS_ERR(em))
                btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);

        return em;
}

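/*
 * Prepare the extent for a direct IO write: either write into an existing
 * extent when a NOCOW write is possible (NODATACOW inode or PREALLOC extent),
 * or allocate a new extent for a COW write. Metadata space is reserved here
 * in both cases, while data space must have been reserved by the caller for
 * the COW case. On success *map and *lenp describe the extent the IO will
 * target.
 */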
static int btrfs_get_blocks_direct_write(struct extent_map **map,
                                         struct inode *inode,
                                         struct btrfs_dio_data *dio_data,
                                         u64 start, u64 *lenp,
                                         unsigned int iomap_flags)
{
        const bool nowait = (iomap_flags & IOMAP_NOWAIT);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_file_extent file_extent;
        struct extent_map *em = *map;
        int type;
        u64 block_start;
        struct btrfs_block_group *bg;
        bool can_nocow = false;
        bool space_reserved = false;
        u64 len = *lenp;
        u64 prev_len;
        int ret = 0;

        /*
         * We don't allocate a new extent in the following cases:
         *
         * 1) The inode is marked as NODATACOW. In this case we'll just use the
         *    existing extent.
         * 2) The extent is marked as PREALLOC. We're good to go here and can
         *    just use the extent.
         */
        if ((em->flags & EXTENT_FLAG_PREALLOC) ||
            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
             em->disk_bytenr != EXTENT_MAP_HOLE)) {
                if (em->flags & EXTENT_FLAG_PREALLOC)
                        type = BTRFS_ORDERED_PREALLOC;
                else
                        type = BTRFS_ORDERED_NOCOW;
                len = min(len, em->len - (start - em->start));
                block_start = btrfs_extent_map_block_start(em) + (start - em->start);

                if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
                                     false) == 1) {
                        bg = btrfs_inc_nocow_writers(fs_info, block_start);
                        if (bg)
                                can_nocow = true;
                }
        }

        prev_len = len;
        if (can_nocow) {
                struct extent_map *em2;

                /* We can NOCOW, so only need to reserve metadata space. */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
                                                      nowait);
                if (ret < 0) {
                        /* Our caller expects us to free the input extent map. */
                        btrfs_free_extent_map(em);
                        *map = NULL;
                        btrfs_dec_nocow_writers(bg);
                        if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
                                ret = -EAGAIN;
                        goto out;
                }
                space_reserved = true;

                em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
                                              &file_extent, type);
                btrfs_dec_nocow_writers(bg);
                if (type == BTRFS_ORDERED_PREALLOC) {
                        btrfs_free_extent_map(em);
                        *map = em2;
                        em = em2;
                }

                if (IS_ERR(em2)) {
                        ret = PTR_ERR(em2);
                        goto out;
                }

                dio_data->nocow_done = true;
        } else {
                /* Our caller expects us to free the input extent map. */
                btrfs_free_extent_map(em);
                *map = NULL;

                if (nowait) {
                        ret = -EAGAIN;
                        goto out;
                }

                /*
                 * If we could not allocate data space before locking the file
                 * range and we can't do a NOCOW write, then we have to fail.
                 */
                if (!dio_data->data_space_reserved) {
                        ret = -ENOSPC;
                        goto out;
                }

                /*
                 * We have to COW and we have already reserved data space before,
                 * so now we reserve only metadata.
                 */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
                                                      false);
                if (ret < 0)
                        goto out;
                space_reserved = true;

                em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out;
                }
                *map = em;
                len = min(len, em->len - (start - em->start));
                if (len < prev_len)
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                                        prev_len - len, true);
        }

        /*
         * We have created our ordered extent, so we can now release our reservation
         * for an outstanding extent.
         */
        btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

        /*
         * Need to update the i_size under the extent lock so buffered
         * readers will get the updated i_size when we unlock.
         */
        if (start + len > i_size_read(inode))
                i_size_write(inode, start + len);
out:
        if (ret && space_reserved) {
                btrfs_delalloc_release_extents(BTRFS_I(inode), len);
                btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
        }
        *lenp = len;
        return ret;
}

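/*
 * The iomap_begin callback for direct IO: lock the file range, resolve it to
 * an extent map and translate that to an iomap, creating the ordered extent
 * for writes. Writes drop all extent bits before returning, while reads keep
 * EXTENT_DIO_LOCKED set until bio completion (or until btrfs_dio_iomap_end()
 * for holes and short iterations).
 */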
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
                loff_t length, unsigned int flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *em;
        struct extent_state *cached_state = NULL;
        struct btrfs_dio_data *dio_data = iter->private;
        u64 lockstart, lockend;
        const bool write = !!(flags & IOMAP_WRITE);
        int ret = 0;
        u64 len = length;
        const u64 data_alloc_len = length;
        u32 unlock_bits = EXTENT_LOCKED;

        /*
         * We could potentially fault if we have a buffer > PAGE_SIZE, and if
         * we're NOWAIT we may submit a bio for a partial range and return
         * EIOCBQUEUED, which would result in an errant short read.
         *
         * The best way to handle this would be to allow for partial completions
         * of iocbs, so we could submit the partial bio, return and fault in
         * the rest of the pages, and then submit the io for the rest of the
         * range.  However we don't have that currently, so simply return
         * -EAGAIN at this point so that the normal path is used.
         */
        if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
                return -EAGAIN;

        /*
         * Cap the size of reads to that usually seen in buffered I/O as we need
         * to allocate a contiguous array for the checksums.
         */
        if (!write)
                len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);

        lockstart = start;
        lockend = start + len - 1;

        /*
         * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
         * enough if we've written compressed pages to this area, so we need to
         * flush the dirty pages again to make absolutely sure that any
         * outstanding dirty pages are on disk - the first flush only starts
         * compression on the data, while keeping the pages locked, so by the
         * time the second flush returns we know bios for the compressed pages
         * were submitted and finished, and the pages are no longer under writeback.
         *
         * If we have a NOWAIT request and we have any pages in the range that
         * are locked, likely due to compression still in progress, we don't want
         * to block on page locks. We also don't want to block on pages marked as
         * dirty or under writeback (same as for the non-compression case).
         * iomap_dio_rw() did the same check, but after that and before we got
         * here, mmap'ed writes may have happened or buffered reads started
         * (readpage() and readahead(), which lock pages), as we haven't locked
         * the file range yet.
         */
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags)) {
                if (flags & IOMAP_NOWAIT) {
                        if (filemap_range_needs_writeback(inode->i_mapping,
                                                          lockstart, lockend))
                                return -EAGAIN;
                } else {
                        ret = filemap_fdatawrite_range(inode->i_mapping, start,
                                                       start + length - 1);
                        if (ret)
                                return ret;
                }
        }

        memset(dio_data, 0, sizeof(*dio_data));

        /*
         * We always try to allocate data space and must do it before locking
         * the file range, to avoid deadlocks with concurrent writes to the same
         * range if the range has several extents and the writes don't expand the
         * current i_size (the inode lock is taken in shared mode). If we fail to
         * allocate data space here we continue and later, after locking the
         * file range, we fail with ENOSPC only if we figure out we can not do a
         * NOCOW write.
         */
        if (write && !(flags & IOMAP_NOWAIT)) {
                ret = btrfs_check_data_free_space(BTRFS_I(inode),
                                                  &dio_data->data_reserved,
                                                  start, data_alloc_len, false);
                if (!ret)
                        dio_data->data_space_reserved = true;
                else if (!(BTRFS_I(inode)->flags &
                           (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
                        goto err;
        }

        /*
         * If this errors out it's because we couldn't invalidate the page
         * cache for this range and we need to fall back to buffered IO, or we
         * are doing a NOWAIT read/write and we need to block.
         */
        ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
        if (ret < 0)
                goto err;

        em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto unlock_err;
        }

        /*
         * For INLINE and COMPRESSED extents we need to fall back to buffered
         * IO.  INLINE is special, and we could probably kludge it in here, but
         * it's still buffered so for safety let's just fall back to the
         * generic buffered path.
         *
         * For COMPRESSED we _have_ to read the entire extent in so we can
         * decompress it, so there will be buffering required no matter what we
         * do, so go ahead and fall back to buffered.
         *
         * We return -ENOTBLK because that's what makes DIO go ahead and go
         * back to buffered IO.  Don't blame me, this is the price we pay for
         * using the generic code.
         */
        if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
                btrfs_free_extent_map(em);
                /*
                 * If we are in a NOWAIT context, return -EAGAIN in order to
                 * fall back to buffered IO. This is not only because we can
                 * block with buffered IO (no support for NOWAIT semantics at
                 * the moment) but also to avoid returning short reads to user
                 * space - this happens if we were able to read some data from
                 * previous non-compressed extents and then when we fall back to
                 * buffered IO, at btrfs_file_read_iter() by calling
                 * filemap_read(), we fail to fault in pages for the read buffer,
                 * in which case filemap_read() returns a short read (the number
                 * of bytes previously read is > 0, so it does not return -EFAULT).
                 */
                ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
                goto unlock_err;
        }

        len = min(len, em->len - (start - em->start));

        /*
         * If we have a NOWAIT request and the range contains multiple extents
         * (or a mix of extents and holes), then we return -EAGAIN to make the
         * caller fallback to a context where it can do a blocking (without
         * NOWAIT) request. This way we avoid doing partial IO and returning
         * success to the caller, which is not optimal for writes and for reads
         * it can result in unexpected behaviour for an application.
         *
         * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
         * iomap_dio_rw(), we can end up returning less data than what the caller
         * asked for, resulting in an unexpected, and incorrect, short read.
         * That is, the caller asked to read N bytes and we return less than that,
         * which is wrong unless we are crossing EOF. This happens if we get a
         * page fault error when trying to fault in pages for the buffer that is
         * associated to the struct iov_iter passed to iomap_dio_rw(), and we
         * have previously submitted bios for other extents in the range, in
         * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
         * those bios have completed by the time we get the page fault error,
         * which we return back to our caller - we should only return EIOCBQUEUED
         * after we have submitted bios for all the extents in the range.
         */
        if ((flags & IOMAP_NOWAIT) && len < length) {
                btrfs_free_extent_map(em);
                ret = -EAGAIN;
                goto unlock_err;
        }

        if (write) {
                ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
                                                    start, &len, flags);
                if (ret < 0)
                        goto unlock_err;
                /* Recalc len in case the new em is smaller than requested */
                len = min(len, em->len - (start - em->start));
                if (dio_data->data_space_reserved) {
                        u64 release_offset;
                        u64 release_len = 0;

                        if (dio_data->nocow_done) {
                                release_offset = start;
                                release_len = data_alloc_len;
                        } else if (len < data_alloc_len) {
                                release_offset = start + len;
                                release_len = data_alloc_len - len;
                        }

                        if (release_len > 0)
                                btrfs_free_reserved_data_space(BTRFS_I(inode),
                                                               dio_data->data_reserved,
                                                               release_offset,
                                                               release_len);
                }
        }

        /*
         * Translate extent map information to iomap.
         * We trim the extents (and move the addr) even though iomap code does
         * that, since we have locked only the parts we are performing I/O in.
         */
        if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
            ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_HOLE;
        } else {
                iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
                iomap->type = IOMAP_MAPPED;
        }
        iomap->offset = start;
        iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
        iomap->length = len;
        btrfs_free_extent_map(em);

        /*
         * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
         * writes only hold it for this part.  We hold the extent lock until
         * we're completely done with the extent map to make sure it remains
         * valid.
         */
        if (write)
                unlock_bits |= EXTENT_DIO_LOCKED;

        btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                               unlock_bits, &cached_state);

        /* We didn't use everything, unlock the dio extent for the remainder. */
        if (!write && (start + len) < lockend)
                btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
                                        lockend, NULL);

        return 0;

unlock_err:
        /*
         * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
         * to update this, be explicit that we expect EXTENT_LOCKED and
         * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
         */
        btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                               EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
        if (dio_data->data_space_reserved) {
                btrfs_free_reserved_data_space(BTRFS_I(inode),
                                               dio_data->data_reserved,
                                               start, data_alloc_len);
                extent_changeset_free(dio_data->data_reserved);
        }

        return ret;
}

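/*
 * The iomap_end callback for direct IO: clean up whatever the iteration left
 * behind for the part of the range that got no bio submitted (finish the
 * ordered extent for writes, unlock the dio extent range for reads) and
 * return -ENOTBLK in that case so the caller falls back to buffered IO.
 */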
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                ssize_t written, unsigned int flags, struct iomap *iomap)
{
        struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
        struct btrfs_dio_data *dio_data = iter->private;
        size_t submitted = dio_data->submitted;
        const bool write = !!(flags & IOMAP_WRITE);
        int ret = 0;

        if (!write && (iomap->type == IOMAP_HOLE)) {
                /* If reading from a hole, unlock and return */
                btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
                                        pos + length - 1, NULL);
                return 0;
        }

        if (submitted < length) {
                pos += submitted;
                length -= submitted;
                if (write)
                        btrfs_finish_ordered_extent(dio_data->ordered, NULL,
                                                    pos, length, false);
                else
                        btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
                                                pos + length - 1, NULL);
                ret = -ENOTBLK;
        }
        if (write) {
                btrfs_put_ordered_extent(dio_data->ordered);
                dio_data->ordered = NULL;
                extent_changeset_free(dio_data->data_reserved);
        }
        return ret;
}

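/*
 * End io callback for direct IO bios: on writes finish the (possibly split)
 * ordered extent, on reads unlock the dio extent range, then pass the bio on
 * to iomap for completion.
 */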
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
        struct btrfs_dio_private *dip =
                container_of(bbio, struct btrfs_dio_private, bbio);
        struct btrfs_inode *inode = bbio->inode;
        struct bio *bio = &bbio->bio;

        if (bio->bi_status) {
                btrfs_warn(inode->root->fs_info,
                "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
                           btrfs_ino(inode), bio->bi_opf,
                           dip->file_offset, dip->bytes, bio->bi_status);
        }

        if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                btrfs_finish_ordered_extent(bbio->ordered, NULL,
                                            dip->file_offset, dip->bytes,
                                            !bio->bi_status);
        } else {
                btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
                                        dip->file_offset + dip->bytes - 1, NULL);
        }

        bbio->bio.bi_private = bbio->private;
        iomap_dio_bio_end_io(bio);
}

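/*
 * Match the ordered extent to the bio being submitted. If the bio covers only
 * a prefix of the ordered extent (a partial write), split the ordered extent
 * (and, except for NOCOW writes, the extent map) so that the submitted part
 * can be completed independently of the rest.
 */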
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
                                        struct btrfs_ordered_extent *ordered)
{
        u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
        u64 len = bbio->bio.bi_iter.bi_size;
        struct btrfs_ordered_extent *new;
        int ret;

        /* Must always be called for the beginning of an ordered extent. */
        if (WARN_ON_ONCE(start != ordered->disk_bytenr))
                return -EINVAL;

        /* No need to split if the ordered extent covers the entire bio. */
        if (ordered->disk_num_bytes == len) {
                refcount_inc(&ordered->refs);
                bbio->ordered = ordered;
                return 0;
        }

        /*
         * Don't split the extent_map for NOCOW extents, as we're writing into
         * a pre-existing one.
         */
        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
                ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
                                             ordered->num_bytes, len,
                                             ordered->disk_bytenr);
                if (ret)
                        return ret;
        }

        new = btrfs_split_ordered_extent(ordered, len);
        if (IS_ERR(new))
                return PTR_ERR(new);
        bbio->ordered = new;
        return 0;
}

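/*
 * Submission callback for direct IO bios: initialize the btrfs_bio, account
 * the submitted bytes and, for writes, attach the matching (split if needed)
 * ordered extent before handing the bio to the btrfs bio layer.
 */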
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
                                loff_t file_offset)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_dio_private *dip =
                container_of(bbio, struct btrfs_dio_private, bbio);
        struct btrfs_dio_data *dio_data = iter->private;

        btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
                       btrfs_dio_end_io, bio->bi_private);

        dip->file_offset = file_offset;
        dip->bytes = bio->bi_iter.bi_size;

        dio_data->submitted += bio->bi_iter.bi_size;

        /*
         * Check if we are doing a partial write.  If we are, we need to split
         * the ordered extent to match the submitted bio.  Hang on to the
         * remaining unfinishable ordered_extent in dio_data so that it can be
         * cancelled in iomap_end to avoid a deadlock wherein faulting the
         * remaining pages is blocked on the outstanding ordered extent.
         */
        if (iter->flags & IOMAP_WRITE) {
                int ret;

                ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
                if (ret) {
                        btrfs_finish_ordered_extent(dio_data->ordered, NULL,
                                                    file_offset, dip->bytes,
                                                    !ret);
                        bio->bi_status = errno_to_blk_status(ret);
                        iomap_dio_bio_end_io(bio);
                        return;
                }
        }

        btrfs_submit_bbio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
        .iomap_begin            = btrfs_dio_iomap_begin,
        .iomap_end              = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
        .submit_io              = btrfs_dio_submit_io,
        .bio_set                = &btrfs_dio_bioset,
};

static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
                              size_t done_before)
{
        struct btrfs_dio_data data = { 0 };

        return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
                            IOMAP_DIO_PARTIAL, &data, done_before);
}

static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
                                         size_t done_before)
{
        struct btrfs_dio_data data = { 0 };

        return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
                              IOMAP_DIO_PARTIAL, &data, done_before);
}

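/*
 * Direct IO requires both the file offset and the memory buffer's alignment
 * to be multiples of the sector size.
 */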
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
                               const struct iov_iter *iter, loff_t offset)
{
        const u32 blocksize_mask = fs_info->sectorsize - 1;

        if (offset & blocksize_mask)
                return -EINVAL;

        if (iov_iter_alignment(iter) & blocksize_mask)
                return -EINVAL;
        return 0;
}

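/*
 * Entry point for direct IO writes. We attempt a zero-copy direct write only
 * when the inode and the data profile allow it, retrying after faulting in
 * the user pages when needed, and fall back to buffered IO (flushing and
 * invalidating the written range afterwards, so a subsequent direct read sees
 * the data) whenever the direct path can't be used or makes no progress.
 */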
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        loff_t pos;
        ssize_t written = 0;
        ssize_t written_buffered;
        size_t prev_left = 0;
        loff_t endbyte;
        ssize_t ret;
        unsigned int ilock_flags = 0;
        struct iomap_dio *dio;
        const u64 data_profile = btrfs_data_alloc_profile(fs_info) &
                                 BTRFS_BLOCK_GROUP_PROFILE_MASK;

        if (iocb->ki_flags & IOCB_NOWAIT)
                ilock_flags |= BTRFS_ILOCK_TRY;

        /*
         * If the write DIO is within EOF, use a shared lock and also only if
         * security bits will likely not be dropped by file_remove_privs() called
         * from btrfs_write_check(). Either will need to be rechecked after the
         * lock was acquired.
         */
        if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
                ilock_flags |= BTRFS_ILOCK_SHARED;

        /*
         * If our data profile has duplication (either extra mirrors or
         * RAID56), we can not trust the direct IO buffer, as its content may
         * change during writeback and cause different contents to be written
         * to the different mirrors.
         *
         * Thus only RAID0 and SINGLE (whose profile mask is 0) can do true
         * zero-copy direct IO.
         */
        if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0)
                goto buffered;

relock:
        ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
        if (ret < 0)
                return ret;

        /* Shared lock cannot be used with security bits set. */
        if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                ilock_flags &= ~BTRFS_ILOCK_SHARED;
                goto relock;
        }

        ret = generic_write_checks(iocb, from);
        if (ret <= 0) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                return ret;
        }

        ret = btrfs_write_check(iocb, ret);
        if (ret < 0) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                goto out;
        }

        pos = iocb->ki_pos;
        /*
         * Re-check since the file size may have changed just before taking the
         * lock, or pos may have changed because of O_APPEND in
         * generic_write_checks().
         */
        if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
            pos + iov_iter_count(from) > i_size_read(inode)) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                ilock_flags &= ~BTRFS_ILOCK_SHARED;
                goto relock;
        }

        if (check_direct_IO(fs_info, from, pos)) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                goto buffered;
        }
        /*
         * We can't control the folios being passed in, applications can write
         * to them while a direct IO write is in progress.  This means the
         * content might change after we calculated the data checksum.
         * Therefore we can end up storing a checksum that doesn't match the
         * persisted data.
         *
         * To be extra safe and avoid false data checksum mismatches, if the
         * inode requires data checksums, just fall back to buffered IO.
         * For buffered IO we have full control of page cache and can ensure
         * no one is modifying the content during writeback.
         */
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
                goto buffered;
        }

        /*
         * The iov_iter can be mapped to the same file range we are writing to.
         * If that's the case, then we will deadlock in the iomap code, because
         * it first calls our callback btrfs_dio_iomap_begin(), which will create
         * an ordered extent, and after that it will fault in the pages that the
         * iov_iter refers to. During the fault in we end up in the readahead
         * pages code (starting at btrfs_readahead()), which will lock the range,
         * find that ordered extent and then wait for it to complete (at
         * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
         * obviously the ordered extent can never complete as we didn't submit
         * yet the respective bio(s). This always happens when the buffer is
         * memory mapped to the same file range, since the iomap DIO code always
         * invalidates pages in the target file range (after starting and waiting
         * for any writeback).
         *
         * So here we disable page faults in the iov_iter and then retry if we
         * got -EFAULT, faulting in the pages before the retry.
         */
again:
        from->nofault = true;
        dio = btrfs_dio_write(iocb, from, written);
        from->nofault = false;

        if (IS_ERR_OR_NULL(dio)) {
                ret = PTR_ERR_OR_ZERO(dio);
        } else {
                /*
                 * If we have a synchronous write, we must make sure the fsync
                 * triggered by the iomap_dio_complete() call below doesn't
                 * deadlock on the inode lock - we are already holding it and we
                 * can't call it after unlocking because we may need to complete
                 * partial writes due to the input buffer (or parts of it) not
                 * being already faulted in.
                 */
                ASSERT(current->journal_info == NULL);
                current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
                ret = iomap_dio_complete(dio);
                current->journal_info = NULL;
        }

        /* No increment (+=) because iomap returns a cumulative value. */
        if (ret > 0)
                written = ret;

        if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
                const size_t left = iov_iter_count(from);
                /*
                 * We have more data left to write. Try to fault in as many as
                 * possible of the remainder pages and retry. We do this without
                 * releasing and locking again the inode, to prevent races with
                 * truncate.
                 *
                 * Also, in case the iov refers to pages in the file range of the
                 * file we want to write to (due to a mmap), we could enter an
                 * infinite loop if we retry after faulting the pages in, since
                 * iomap will invalidate any pages in the range early on, before
                 * it tries to fault in the pages of the iov. So we keep track of
                 * how much was left of iov in the previous EFAULT and fallback
                 * to buffered IO in case we haven't made any progress.
                 */
                if (left == prev_left) {
                        ret = -ENOTBLK;
                } else {
                        fault_in_iov_iter_readable(from, left);
                        prev_left = left;
                        goto again;
                }
        }

        btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

        /*
         * If 'ret' is -ENOTBLK or we have not written all data, then it means
         * we must fall back to buffered IO.
         */
        if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
                goto out;

buffered:
        /*
         * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
         * it must retry the operation in a context where blocking is acceptable,
         * because even if we end up not blocking during the buffered IO attempt
         * below, we will block when flushing and waiting for the IO.
         */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                ret = -EAGAIN;
                goto out;
        }

        pos = iocb->ki_pos;
        written_buffered = btrfs_buffered_write(iocb, from);
        if (written_buffered < 0) {
                ret = written_buffered;
                goto out;
        }
        /*
         * Ensure all data is persisted. We want the next direct IO read to be
         * able to read what was just written.
         */
        endbyte = pos + written_buffered - 1;
        ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
        if (ret)
                goto out;
        ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
        if (ret)
                goto out;
        written += written_buffered;
        iocb->ki_pos = pos + written_buffered;
        invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
                                 endbyte >> PAGE_SHIFT);
out:
        return ret < 0 ? ret : written;
}

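/*
 * Validate a direct IO read: besides the common alignment checks, reject
 * iovec arrays where two entries share the same base address, which the
 * direct read path does not support.
 */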
static int check_direct_read(struct btrfs_fs_info *fs_info,
                             const struct iov_iter *iter, loff_t offset)
{
        int ret;
        int i, seg;

        ret = check_direct_IO(fs_info, iter, offset);
        if (ret < 0)
                return ret;

        if (!iter_is_iovec(iter))
                return 0;

        for (seg = 0; seg < iter->nr_segs; seg++) {
                for (i = seg + 1; i < iter->nr_segs; i++) {
                        const struct iovec *iov1 = iter_iov(iter) + seg;
                        const struct iovec *iov2 = iter_iov(iter) + i;

                        if (iov1->iov_base == iov2->iov_base)
                                return -EINVAL;
                }
        }
        return 0;
}

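/*
 * Entry point for direct IO reads. The read runs with page faults disabled,
 * since btrfs_dio_iomap_begin() returns with the extent range locked and a
 * page fault while reading could deadlock on that lock. On -EFAULT we fault
 * in the remaining pages and retry, and if that makes no progress we return
 * what was read so far and let the caller fall back to buffered IO.
 */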
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        size_t prev_left = 0;
        ssize_t read = 0;
        ssize_t ret;

        if (fsverity_active(inode))
                return 0;

        if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
                return 0;

        btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
        /*
         * This is similar to what we do for direct IO writes, see the comment
         * at btrfs_direct_write(), but we also disable page faults in addition
         * to disabling them only at the iov_iter level. This is because when
         * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
         * which can still trigger page fault ins despite having set ->nofault
         * to true on our 'to' iov_iter.
         *
         * The difference to direct IO writes is that we deadlock when trying
         * to lock the extent range in the inode's tree during the page reads
         * triggered by the fault in (while for writes it is due to waiting for
         * our own ordered extent). This is because for direct IO reads,
         * btrfs_dio_iomap_begin() returns with the extent range locked, which
         * is only unlocked in the endio callback (end_bio_extent_readpage()).
         */
        pagefault_disable();
        to->nofault = true;
        ret = btrfs_dio_read(iocb, to, read);
        to->nofault = false;
        pagefault_enable();

        /* No increment (+=) because iomap returns a cumulative value. */
        if (ret > 0)
                read = ret;

        if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
                const size_t left = iov_iter_count(to);

                if (left == prev_left) {
                        /*
                         * We didn't make any progress since the last attempt,
                         * fallback to a buffered read for the remainder of the
                         * range. This is just to avoid any possibility of looping
                         * for too long.
                         */
                        ret = read;
                } else {
                        /*
                         * We made some progress since the last retry or this is
                         * the first time we are retrying. Fault in as many pages
                         * as possible and retry.
                         */
                        fault_in_iov_iter_writeable(to, left);
                        prev_left = left;
                        goto again;
                }
        }
        btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
        return ret < 0 ? ret : read;
}

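/*
 * The dio bioset front pads its bios so that each allocated bio sits at
 * bbio.bio inside a struct btrfs_dio_private, which btrfs_dio_end_io() then
 * recovers with container_of().
 */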
int __init btrfs_init_dio(void)
{
        if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_dio_private, bbio.bio),
                        BIOSET_NEED_BVECS))
                return -ENOMEM;

        return 0;
}

void __cold btrfs_destroy_dio(void)
{
        bioset_exit(&btrfs_dio_bioset);
}