// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"

static bool __blk_map_iter_next(struct blk_map_iter *iter)
{
        if (iter->iter.bi_size)
                return true;
        if (!iter->bio || !iter->bio->bi_next)
                return false;

        iter->bio = iter->bio->bi_next;
        if (iter->is_integrity) {
                iter->iter = bio_integrity(iter->bio)->bip_iter;
                iter->bvecs = bio_integrity(iter->bio)->bip_vec;
        } else {
                iter->iter = iter->bio->bi_iter;
                iter->bvecs = iter->bio->bi_io_vec;
        }
        return true;
}

static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
                              struct phys_vec *vec)
{
        unsigned int max_size;
        struct bio_vec bv;

        if (!iter->iter.bi_size)
                return false;

        bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
        vec->paddr = bvec_phys(&bv);
        max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
        bv.bv_len = min(bv.bv_len, max_size);
        bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);

        /*
         * If we are entirely done with this bi_io_vec entry, check if the next
         * one could be merged into it.  This typically happens when moving to
         * the next bio, but some callers also don't pack bvecs tightly.
         */
        while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
                struct bio_vec next;

                if (!__blk_map_iter_next(iter))
                        break;

                next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
                if (bv.bv_len + next.bv_len > max_size ||
                    !biovec_phys_mergeable(req->q, &bv, &next))
                        break;

                bv.bv_len += next.bv_len;
                bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
        }

        vec->len = bv.bv_len;
        return true;
}

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated
 * IOVA-based path for single-segment mappings.
 */
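/*
 * As a concrete illustration (assuming the common 4k IOMMU granule):
 * dma_get_merge_boundary() then returns 0xfff, so the helper below only
 * allows the coalescing path when none of the request's physical gaps
 * touch those low 12 bits, i.e. when every internal segment boundary is
 * 4k aligned.
 */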
static inline bool blk_can_dma_map_iova(struct request *req,
                struct device *dma_dev)
{
        return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
}

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
        iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr);
        iter->len = vec->len;
        return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
                struct blk_dma_iter *iter, struct phys_vec *vec)
{
        unsigned int attrs = 0;

        if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
                attrs |= DMA_ATTR_MMIO;

        iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
                        rq_dma_dir(req), attrs);
        if (dma_mapping_error(dma_dev, iter->addr)) {
                iter->status = BLK_STS_RESOURCE;
                return false;
        }
        iter->len = vec->len;
        return true;
}

static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
                struct dma_iova_state *state, struct blk_dma_iter *iter,
                struct phys_vec *vec)
{
        enum dma_data_direction dir = rq_dma_dir(req);
        unsigned int attrs = 0;
        size_t mapped = 0;
        int error;

        iter->addr = state->addr;
        iter->len = dma_iova_size(state);

        if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
                attrs |= DMA_ATTR_MMIO;

        do {
                error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
                                vec->len, dir, attrs);
                if (error)
                        goto out_unlink;
                mapped += vec->len;
        } while (blk_map_iter_next(req, &iter->iter, vec));

        error = dma_iova_sync(dma_dev, state, 0, mapped);
        if (error)
                goto out_unlink;

        return true;

out_unlink:
        dma_iova_destroy(dma_dev, state, mapped, dir, attrs);
        iter->status = errno_to_blk_status(error);
        return false;
}

static inline void blk_rq_map_iter_init(struct request *rq,
                                        struct blk_map_iter *iter)
{
        struct bio *bio = rq->bio;

        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
                *iter = (struct blk_map_iter) {
                        .bvecs = &rq->special_vec,
                        .iter = {
                                .bi_size = rq->special_vec.bv_len,
                        }
                };
        } else if (bio) {
                *iter = (struct blk_map_iter) {
                        .bio = bio,
                        .bvecs = bio->bi_io_vec,
                        .iter = bio->bi_iter,
                };
        } else {
                /* the internal flush request may not have a bio attached */
                *iter = (struct blk_map_iter) {};
        }
}

static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
                struct dma_iova_state *state, struct blk_dma_iter *iter,
                unsigned int total_len)
{
        struct phys_vec vec;

        memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
        iter->status = BLK_STS_OK;
        iter->p2pdma.map = PCI_P2PDMA_MAP_NONE;

        /*
         * Grab the first segment ASAP because we'll need it to check for P2P
         * transfers.
         */
        if (!blk_map_iter_next(req, &iter->iter, &vec))
                return false;

        switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
                                 phys_to_page(vec.paddr))) {
        case PCI_P2PDMA_MAP_BUS_ADDR:
                return blk_dma_map_bus(iter, &vec);
        case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
                /*
                 * P2P transfers through the host bridge are treated the
                 * same as non-P2P transfers below and during unmap.
                 */
        case PCI_P2PDMA_MAP_NONE:
                break;
        default:
                iter->status = BLK_STS_INVAL;
                return false;
        }

        if (blk_can_dma_map_iova(req, dma_dev) &&
            dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
                return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
        memset(state, 0, sizeof(*state));
        return blk_dma_map_direct(req, dma_dev, iter, &vec);
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req:        request to map
 * @dma_dev:    device to map to
 * @state:      DMA IOVA state
 * @iter:       block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
 * caller and don't need to be initialized.  @state needs to be stored for use
 * at unmap time; @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
                struct dma_iova_state *state, struct blk_dma_iter *iter)
{
        blk_rq_map_iter_init(req, &iter->iter);
        return blk_dma_map_iter_start(req, dma_dev, state, iter,
                                      blk_rq_payload_bytes(req));
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
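/*
 * A minimal usage sketch, not taken from this file: iod->dma_state and the
 * surrounding driver code are hypothetical, but the calls and the iter.addr,
 * iter.len and iter.status conventions are the ones documented above.
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, &iod->dma_state, &iter))
 *		return iter.status;
 *	if (blk_rq_dma_map_coalesce(&iod->dma_state))
 *		return driver_map_single_range(iod, iter.addr, iter.len);
 *
 * driver_map_single_range() stands in for however the driver consumes a
 * single coalesced DMA range; the non-coalesced loop is sketched after
 * blk_rq_dma_map_iter_next() below.
 */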

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req:        request to map
 * @dma_dev:    device to map to
 * @iter:       block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start().  See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
                struct blk_dma_iter *iter)
{
        struct phys_vec vec;

        if (!blk_map_iter_next(req, &iter->iter, &vec))
                return false;

        if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
                return blk_dma_map_bus(iter, &vec);
        return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
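/*
 * Continuing the (hypothetical) sketch from blk_rq_dma_map_iter_start(): if
 * the mapping was not coalesced, each segment is consumed in a loop, and
 * iter.status tells an exhausted iterator apart from a mapping failure:
 *
 *	do {
 *		driver_add_segment(iod, iter.addr, iter.len);
 *	} while (blk_rq_dma_map_iter_next(req, dma_dev, &iter));
 *	if (iter.status != BLK_STS_OK)
 *		return iter.status;
 *
 * driver_add_segment() is again purely illustrative.
 */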

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
        if (!*sg)
                return sglist;

        /*
         * If the driver previously mapped a shorter list, we could see a
         * termination bit prematurely unless it fully inits the sg table
         * on each mapping. We KNOW that there must be more entries here
         * or the driver would be buggy, so force clear the termination bit
         * to avoid doing a full sg_init_table() in drivers for each command.
         */
        sg_unmark_end(*sg);
        return sg_next(*sg);
}

/*
 * Map a request to a scatterlist and return the number of sg entries set up.
 * The caller must make sure the scatterlist can hold rq->nr_phys_segments
 * entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
                    struct scatterlist **last_sg)
{
        struct blk_map_iter iter;
        struct phys_vec vec;
        int nsegs = 0;

        blk_rq_map_iter_init(rq, &iter);
        while (blk_map_iter_next(rq, &iter, &vec)) {
                *last_sg = blk_next_sg(last_sg, sglist);

                WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
                sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
                                offset_in_page(vec.paddr));
                nsegs++;
        }

        if (*last_sg)
                sg_mark_end(*last_sg);

        /*
         * Something must have gone wrong if the calculated number of
         * segments is bigger than the number of the request's physical
         * segments.
         */
        WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

        return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
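/*
 * A minimal caller sketch (sgt and nsegs are illustrative locals; the sg
 * table must have been sized for blk_rq_nr_phys_segments(rq) entries as
 * required above):
 *
 *	struct scatterlist *last_sg = NULL;
 *
 *	sg_init_table(sgt, blk_rq_nr_phys_segments(rq));
 *	nsegs = __blk_rq_map_sg(rq, sgt, &last_sg);
 */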

#ifdef CONFIG_BLK_DEV_INTEGRITY
/**
 * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
 *                                       for a request
 * @req:        request to map
 * @dma_dev:    device to map to
 * @state:      DMA IOVA state
 * @iter:       block layer DMA iterator
 *
 * Start DMA mapping @req integrity data to @dma_dev.  @state and @iter are
 * provided by the caller and don't need to be initialized.  @state needs to be
 * stored for use at unmap time; @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr
 * and the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_integrity_dma_map_iter_start(struct request *req,
                struct device *dma_dev, struct dma_iova_state *state,
                struct blk_dma_iter *iter)
{
        unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
                                           blk_rq_sectors(req));
        struct bio *bio = req->bio;

        iter->iter = (struct blk_map_iter) {
                .bio = bio,
                .iter = bio_integrity(bio)->bip_iter,
                .bvecs = bio_integrity(bio)->bip_vec,
                .is_integrity = true,
        };
        return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);

/**
 * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for
 *                                       a request
 * @req:        request to map
 * @dma_dev:    device to map to
 * @iter:       block layer DMA iterator
 *
 * Iterate to the next integrity mapping after a previous call to
 * blk_rq_integrity_dma_map_iter_start().  See there for a detailed description
 * of the arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_integrity_dma_map_iter_next(struct request *req,
               struct device *dma_dev, struct blk_dma_iter *iter)
{
        struct phys_vec vec;

        if (!blk_map_iter_next(req, &iter->iter, &vec))
                return false;

        if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
                return blk_dma_map_bus(iter, &vec);
        return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
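/*
 * Usage mirrors the data-path sketch above, only with the integrity variants
 * (again hypothetical driver code; iod->meta_dma_state is an assumed
 * per-command field):
 *
 *	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev,
 *			&iod->meta_dma_state, &iter))
 *		return iter.status;
 *	do {
 *		driver_add_meta_segment(iod, iter.addr, iter.len);
 *	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));
 *
 * with the same iter.status convention as for the data payload.
 */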

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @rq:         request to map
 * @sglist:     target scatterlist
 *
 * Description: Map the integrity vectors of the request into a
 * scatterlist.  The scatterlist must be big enough to hold all
 * elements, i.e. sized using blk_rq_count_integrity_sg() or
 * rq->nr_integrity_segments.
 */
int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
{
        struct request_queue *q = rq->q;
        struct scatterlist *sg = NULL;
        struct bio *bio = rq->bio;
        unsigned int segments = 0;
        struct phys_vec vec;

        struct blk_map_iter iter = {
                .bio = bio,
                .iter = bio_integrity(bio)->bip_iter,
                .bvecs = bio_integrity(bio)->bip_vec,
                .is_integrity = true,
        };

        while (blk_map_iter_next(rq, &iter, &vec)) {
                sg = blk_next_sg(&sg, sglist);

                WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
                sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
                                offset_in_page(vec.paddr));
                segments++;
        }

        if (sg)
                sg_mark_end(sg);

        /*
         * Something must have gone wrong if the calculated number of
         * segments is bigger than the number of the request's physical
         * integrity segments.
         */
        BUG_ON(segments > rq->nr_integrity_segments);
        BUG_ON(segments > queue_max_integrity_segments(q));
        return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
#endif