root/usr/src/uts/intel/io/viona/viona_tx.c
/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define BNXE_NIC_DRIVER         "bnxe"

/*
 * Tunable controlling whether TX data copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Tunable for maximum configured TX header padding.
 */
uint_t viona_max_header_pad = 256;

/*
 * Copy TX data out of the virtio ring to avoid having to wait on packet
 * transmission before ring resources can be freed.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
        VFC_UNINITIALIZED       = 0,
        VFC_COPY_UNNEEDED       = 1,
        VFC_COPY_REQUIRED       = 2,
} viona_force_copy_state = VFC_UNINITIALIZED;

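/*
 * State tracked for each TX descriptor chain whose guest memory is loaned to
 * the MAC layer rather than copied.  The d_frtn free routine runs as each
 * desballoc()'d mblk referencing the chain is freed.  d_ref holds a base
 * reference for the in-flight chain plus one reference per outstanding mblk;
 * when the last such mblk is freed, viona_desb_release() posts d_len/d_cookie
 * to the 'used' ring, releases the held guest pages (d_pages), and marks the
 * slot free again.  d_headers is a private buffer into which the packet
 * headers are copied to guard against TOCTOU issues.
 */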
struct viona_desb {
        frtn_t                  d_frtn;
        viona_vring_t           *d_ring;
        uint_t                  d_ref;
        uint32_t                d_len;
        uint16_t                d_cookie;
        uchar_t                 *d_headers;
        vmm_page_t              *d_pages;
};

static size_t viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        while (ring->vr_xfer_outstanding != 0) {
                /*
                 * Paying heed to signals is counterproductive here.  This is a
                 * very tight loop if pending transfers take an extended amount
                 * of time to be reclaimed while the host process is exiting.
                 */
                cv_wait(&ring->vr_cv, &ring->vr_lock);
        }
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
boolean_t
viona_tx_copy_needed(void)
{
        boolean_t result;

        if (viona_default_tx_copy) {
                return (B_TRUE);
        }

        mutex_enter(&viona_force_copy_lock);
        if (viona_force_copy_state == VFC_UNINITIALIZED) {
                major_t bnxe_major;

                /*
                 * The original code for viona featured an explicit check for
                 * the bnxe driver which, when found present, necessitated that
                 * all transmissions be copied into their own mblks instead of
                 * passing guest memory to the underlying device.
                 *
                 * The motivations for this are unclear, but until it can be
                 * proven unnecessary, the check lives on.
                 */
                viona_force_copy_state = VFC_COPY_UNNEEDED;
                if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
                    != DDI_MAJOR_T_NONE) {
                        if (ddi_hold_installed_driver(bnxe_major) != NULL) {
                                viona_force_copy_state = VFC_COPY_REQUIRED;
                                ddi_rele_driver(bnxe_major);
                        }
                }
        }
        result = (viona_force_copy_state == VFC_COPY_REQUIRED);
        mutex_exit(&viona_force_copy_lock);

        return (result);
}

void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
        const viona_link_params_t *vlp = &ring->vr_link->l_params;

        ring->vr_tx.vrt_header_pad = vlp->vlp_tx_header_pad;
        /* Allocate desb handles for TX ring if packet copying not forced */
        if (!ring->vr_link->l_params.vlp_tx_copy_data) {
                viona_desb_t *dp =
                    kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
                ring->vr_tx.vrt_desb = dp;

                const size_t header_sz =
                    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
                for (uint_t i = 0; i < qsz; i++, dp++) {
                        dp->d_frtn.free_func = viona_desb_release;
                        dp->d_frtn.free_arg = (void *)dp;
                        dp->d_ring = ring;
                        dp->d_headers = kmem_zalloc(header_sz, KM_SLEEP);
                }
        }

        /* Allocate ring-sized iovec buffers for TX */
        ring->vr_tx.vrt_iov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
        ring->vr_tx.vrt_iov_cnt = qsz;
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
        if (ring->vr_tx.vrt_desb != NULL) {
                viona_desb_t *dp = ring->vr_tx.vrt_desb;

                const size_t header_sz =
                    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
                for (uint_t i = 0; i < qsz; i++, dp++) {
                        kmem_free(dp->d_headers, header_sz);
                }
                kmem_free(ring->vr_tx.vrt_desb, sizeof (viona_desb_t) * qsz);
                ring->vr_tx.vrt_desb = NULL;
        }

        if (ring->vr_tx.vrt_iov != NULL) {
                ASSERT3U(ring->vr_tx.vrt_iov_cnt, !=, 0);

                kmem_free(ring->vr_tx.vrt_iov,
                    sizeof (struct iovec) * ring->vr_tx.vrt_iov_cnt);
                ring->vr_tx.vrt_iov = NULL;
                ring->vr_tx.vrt_iov_cnt = 0;
        }
}

static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
        vq_pushchain(ring, len, cookie);

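        /*
         * Ensure the 'used' ring update from vq_pushchain() is visible before
         * the guest is (potentially) interrupted.
         */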
        membar_enter();
        viona_intr_ring(ring, B_FALSE);
}

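/*
 * Number of descriptor chains to process in the TX loop before pausing to
 * check for ring stop/bail conditions.
 */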
#define TX_BURST_THRESH 32

void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
        (void) thread_vsetname(curthread, "viona_tx_%u_%p",
            ring->vr_index, ring);

        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT3U(ring->vr_state, ==, VRS_RUN);

        mutex_exit(&ring->vr_lock);

        for (;;) {
                size_t cnt_tx = 0, size_tx = 0;
                uint_t burst = 0;

                viona_ring_disable_notify(ring);
                while (viona_ring_num_avail(ring) != 0) {
                        const size_t size_sent = viona_tx(link, ring);
                        if (size_sent != 0) {
                                /* Account for successful transmissions */
                                size_tx += size_sent;
                                cnt_tx++;
                        }
                        burst++;

                        /*
                         * It is advantageous for throughput to keep this
                         * transmission loop tight, but periodic breaks to
                         * check for other events are of value too.
                         */
                        if (burst >= TX_BURST_THRESH) {
                                mutex_enter(&ring->vr_lock);
                                const bool need_bail = vring_need_bail(ring);
                                mutex_exit(&ring->vr_lock);

                                if (need_bail) {
                                        break;
                                }
                                burst = 0;
                        }
                }

                VIONA_PROBE2(tx, viona_link_t *, link, size_t, cnt_tx);
                if (cnt_tx != 0) {
                        viona_ring_stat_accept(ring, cnt_tx, size_tx);
                }

                /*
                 * Check for available descriptors on the ring once more in
                 * case a late addition raced with the NO_NOTIFY flag toggle.
                 *
                 * The barrier ensures that visibility of the no-notify
                 * store does not cross the viona_ring_num_avail() check below.
                 */
                viona_ring_enable_notify(ring);
                membar_enter();

                if (viona_ring_num_avail(ring) == 0 &&
                    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
                        /*
                         * The NOTIFY_ON_EMPTY interrupt should not pay heed to
                         * the presence of AVAIL_NO_INTERRUPT.
                         */
                        viona_intr_ring(ring, B_TRUE);
                }

                mutex_enter(&ring->vr_lock);
                for (;;) {
                        if (vring_need_bail(ring)) {
                                ring->vr_state = VRS_STOP;
                                viona_tx_wait_outstanding(ring);
                                return;
                        }

                        if (vmm_drv_lease_expired(ring->vr_lease)) {
                                ring->vr_state_flags |= VRSF_RENEW;
                                /*
                                 * When renewing the lease for the ring, no TX
                                 * frames may be outstanding, as they contain
                                 * references to guest memory.
                                 */
                                viona_tx_wait_outstanding(ring);

                                const boolean_t renewed =
                                    viona_ring_lease_renew(ring);
                                ring->vr_state_flags &= ~VRSF_RENEW;

                                if (!renewed) {
                                        /* stop ring on failed renewal */
                                        ring->vr_state = VRS_STOP;
                                        return;
                                }
                        }

                        if (viona_ring_num_avail(ring) != 0) {
                                break;
                        }

                        /* Wait for further activity on the ring */
                        (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
                }
                mutex_exit(&ring->vr_lock);
        }
        /* UNREACHABLE */
}

static void
viona_desb_release(viona_desb_t *dp)
{
        viona_vring_t *ring = dp->d_ring;
        uint_t ref;
        uint32_t len;
        uint16_t cookie;

        ref = atomic_dec_uint_nv(&dp->d_ref);
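        /*
         * Other mblks still hold references to this desb; clean-up is deferred
         * until the last of them is freed.
         */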
        if (ref > 1) {
                return;
        }

        /*
         * The desb corresponding to this index must be ready for reuse before
         * the descriptor is returned to the guest via the 'used' ring.
         */
        len = dp->d_len;
        cookie = dp->d_cookie;
        dp->d_len = 0;
        dp->d_cookie = 0;
        vmm_drv_page_release_chain(dp->d_pages);
        dp->d_pages = NULL;

        /*
         * Ensure all other changes to the desb are visible prior to zeroing its
         * refcount, signifying its readiness for reuse.
         */
        membar_exit();
        dp->d_ref = 0;

        viona_tx_done(ring, len, cookie);

        mutex_enter(&ring->vr_lock);
        if ((--ring->vr_xfer_outstanding) == 0) {
                cv_broadcast(&ring->vr_cv);
        }
        mutex_exit(&ring->vr_lock);
}

/*
 * Confirm that the requested checksum operation acts within the bounds of the
 * provided packet, and that the checksum itself will be stored in the "copied
 * headers" portion of said packet.
 */
static boolean_t
viona_tx_csum_req_valid(const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint_t copied_len)
{
        const uint_t csum_off = hdr->vrh_csum_offset + hdr->vrh_csum_start;

        if (hdr->vrh_csum_start >= meoi->meoi_len ||
            hdr->vrh_csum_start < meoi->meoi_l2hlen ||
            csum_off >= meoi->meoi_len ||
            (csum_off + sizeof (uint16_t)) > copied_len) {
                return (B_FALSE);
        }

        return (B_TRUE);
}

/*
 * Configure mblk to request full checksum offloading, given the virtio and meoi
 * details provided.
 */
static void
viona_tx_hcksum_full(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
        /*
         * Out of caution, zero the checksum field in case any driver and/or
         * device would erroneously use it in the sum calculation.
         */
        uint16_t *csump = (uint16_t *)
            (mp->b_rptr + hdr->vrh_csum_start + hdr->vrh_csum_offset);
        *csump = 0;

        mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | added_flags);
}

/*
 * Configure mblk to request partial checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_partial(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
        /*
         * MAC expects these offsets to be relative to the start of the L3
         * header rather than the L2 frame.
         */
        mac_hcksum_set(mp,
            hdr->vrh_csum_start - meoi->meoi_l2hlen,
            hdr->vrh_csum_start + hdr->vrh_csum_offset - meoi->meoi_l2hlen,
            meoi->meoi_len - meoi->meoi_l2hlen,
            0, HCK_PARTIALCKSUM | added_flags);
}

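/*
 * Translate the guest's checksum/LSO offload request into MAC offload flags on
 * the mblk.  Returns B_FALSE if the request could not be honored; the caller
 * may still attempt transmission, but the result is recorded as an error.
 */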
static boolean_t
viona_tx_offloads(viona_vring_t *ring, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, mblk_t *mp, uint32_t len)
{
        viona_link_t *link = ring->vr_link;
        const uint32_t cap_csum = link->l_cap_csum;

        /*
         * Since viona is a "transitional device", the data stored by the
         * driver will either be in the guest's native endian format (for the
         * legacy interface - see sections 2.4.3 and 5.1.6.1 of the VIRTIO 1.0
         * spec for more info) or little-endian. At this time the only guests
         * using viona are x86 and we can assume little-endian.
         */
        const uint16_t gso_size = LE_16(hdr->vrh_gso_size);

        if (!viona_tx_csum_req_valid(hdr, meoi, MBLKL(mp))) {
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        }

        const uint16_t ftype = meoi->meoi_l3proto;
        const uint8_t ipproto = meoi->meoi_l4proto;
        if (ftype != ETHERTYPE_IP && ftype != ETHERTYPE_IPV6) {
                /* Ignore checksum offload requests for non-IP protocols. */
                VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link,
                    mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
                return (B_FALSE);
        }

        /* Configure TCPv4 LSO when requested */
        if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
            ftype == ETHERTYPE_IP) {
                if ((link->l_features & VIRTIO_NET_F_HOST_TSO4) == 0) {
                        VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
                            mblk_t *, mp);
                        VIONA_RING_STAT_INCR(ring, tx_gso_fail);
                        return (B_FALSE);
                }

                lso_info_set(mp, gso_size, HW_LSO);

                /*
                 * We should have already verified that an adequate form of
                 * hardware checksum offload is present for TSOv4
                 */
                ASSERT3U(cap_csum &
                    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4), !=, 0);

                if ((cap_csum & HCKSUM_INET_FULL_V4) != 0) {
                        viona_tx_hcksum_full(mp, hdr, meoi, HW_LSO);
                } else if ((cap_csum & HCKSUM_INET_PARTIAL) != 0) {
                        /*
                         * Our native IP stack doesn't set the L4 length field
                         * of the pseudo header when LSO is in play.  Other IP
                         * stacks, e.g. Linux, do include the length field.
                         * This is a problem because the hardware expects that
                         * the length field is not set. When it is set, it will
                         * cause an incorrect TCP checksum to be generated.
                         * Linux avoids this issue by correcting the
                         * pseudo-header checksum in the driver code.
                         *
                         * In order to get the correct HW checksum we need to
                         * assume the guest's IP stack gave us a bogus TCP
                         * partial checksum and calculate it ourselves.
                         */
                        ipha_t *ipha =
                            (ipha_t *)(mp->b_rptr + meoi->meoi_l2hlen);
                        uint16_t *cksump =
                            IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));

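                        /*
                         * Build the pseudo-header sum by hand: seed with the
                         * protocol component (IP_TCP_CSUM_COMP), fold in the
                         * source and destination addresses, and reduce to 16
                         * bits.  The L4 length is intentionally omitted, as
                         * described above.
                         */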
                        uint32_t cksum = IP_TCP_CSUM_COMP;
                        const ipaddr_t src = ipha->ipha_src;
                        const ipaddr_t dst = ipha->ipha_dst;
                        cksum += (dst >> 16) + (dst & 0xffff) +
                            (src >> 16) + (src & 0xffff);
                        cksum = (cksum & 0xffff) + (cksum >> 16);
                        *cksump = (cksum & 0xffff) + (cksum >> 16);

                        /*
                         * NICs such as ixgbe require that ipv4 checksum offload
                         * also be enabled when performing LSO.
                         */
                        uint32_t v4csum = 0;
                        if ((cap_csum & HCKSUM_IPHDRCKSUM) != 0) {
                                v4csum = HCK_IPV4_HDRCKSUM;
                                ipha->ipha_hdr_checksum = 0;
                        }

                        viona_tx_hcksum_partial(mp, hdr, meoi, HW_LSO | v4csum);
                } else {
                        /*
                         * This should be unreachable: We do not permit LSO
                         * without adequate checksum offload capability.
                         */
                        VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
                            mblk_t *, mp);
                        VIONA_RING_STAT_INCR(ring, tx_gso_fail);
                        return (B_FALSE);
                }

                return (B_TRUE);
        }

        /*
         * Partial checksum support from the NIC is ideal, since it most closely
         * maps to the interface defined by virtio.
         */
        if ((cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
            (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                viona_tx_hcksum_partial(mp, hdr, meoi, 0);
                return (B_TRUE);
        }

        /*
         * Without partial checksum support, look to the L3/L4 protocol
         * information to see if the NIC can handle it.  If not, the checksum
         * will need to be calculated inline.
         */
        if (ftype == ETHERTYPE_IP) {
                if ((cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        viona_tx_hcksum_full(mp, hdr, meoi, 0);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum);
                return (B_FALSE);
        } else if (ftype == ETHERTYPE_IPV6) {
                if ((cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
                    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
                        viona_tx_hcksum_full(mp, hdr, meoi, 0);
                        return (B_TRUE);
                }

                /* XXX: Implement manual fallback checksumming? */
                VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
                VIONA_RING_STAT_INCR(ring, fail_hcksum6);
                return (B_FALSE);
        }

        /*
         * Note the failure for unrecognized protocols, but soldier on to make
         * our best effort at getting the frame out the door.
         */
        VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
        VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
        return (B_FALSE);
}

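/*
 * Allocate the leading mblk for a TX frame.  When guest memory is being loaned
 * (desb handles are present), this is a desballoc()'d buffer sized for the
 * copied headers plus any configured padding; otherwise a single allocb()
 * covers the entire packet.  Returns NULL if the desb slot is still in use or
 * the allocation fails.
 */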
static mblk_t *
viona_tx_alloc_headers(viona_vring_t *ring, uint16_t cookie, viona_desb_t **dpp,
    uint32_t len)
{
        ASSERT3P(*dpp, ==, NULL);

        mblk_t *mp = NULL;
        const size_t header_pad = ring->vr_tx.vrt_header_pad;

        if (ring->vr_tx.vrt_desb != NULL) {
                viona_desb_t *dp = &ring->vr_tx.vrt_desb[cookie];
                const size_t header_sz = VIONA_MAX_HDRS_LEN + header_pad;

                /*
                 * If the guest driver is operating properly, each desb slot
                 * should be available for use when processing a TX descriptor
                 * from the 'avail' ring.  In the case of drivers that reuse a
                 * descriptor before it has been posted to the 'used' ring, the
                 * data is simply dropped.
                 */
                if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
                        return (NULL);
                }

                dp->d_cookie = cookie;
                mp = desballoc(dp->d_headers, header_sz, 0, &dp->d_frtn);

                if (mp != NULL) {
                        /*
                         * Account for the successful desballoc, and communicate
                         * out the desb handle for subsequent use
                         */
                        dp->d_ref++;
                        *dpp = dp;
                } else {
                        /* Reset the desb back to its "available" state */
                        dp->d_ref = 0;
                }
        } else {
                /*
                 * If we are going to be copying the entire packet, we might as
                 * well allocate for it all in one go.
                 */
                mp = allocb(len + header_pad, 0);
        }

        /* Push pointers forward to account for requested header padding */
        if (mp != NULL && header_pad != 0) {
                mp->b_rptr = mp->b_wptr = (DB_BASE(mp) + header_pad);
        }

        return (mp);
}

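/*
 * Copy the leading portion of the packet (or, when copying is forced, the
 * entire packet) out of the guest's buffers into mp, parsing the header
 * offsets into meoi.  Returns B_FALSE if the L2 header cannot be parsed or if
 * memory for the remainder of the packet cannot be allocated.
 */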
static boolean_t
viona_tx_copy_headers(viona_vring_t *ring, iov_bunch_t *iob, mblk_t *mp,
    mac_ether_offload_info_t *meoi)
{
        ASSERT(mp->b_cont == NULL);

        if (ring->vr_tx.vrt_desb == NULL) {
                /*
                 * If not using guest data loaning through the desb, then we
                 * expect viona_tx_alloc_headers() to have allocated space for
                 * the entire packet, which we should copy now.
                 */
                const uint32_t pkt_size = iob->ib_remain;

                VERIFY(MBLKTAIL(mp) >= pkt_size);
                VERIFY(iov_bunch_copy(iob, mp->b_wptr, pkt_size));
                mp->b_wptr += pkt_size;
                mac_ether_offload_info(mp, meoi);
                return (B_TRUE);
        }

        /*
         * We want to maximize the amount of guest data we loan when performing
         * packet transmission, with the caveat that we must copy the packet
         * headers to prevent TOCTOU issues.
         */
        const uint32_t copy_sz = MIN(iob->ib_remain, MBLKTAIL(mp));

        VERIFY(iov_bunch_copy(iob, mp->b_wptr, copy_sz));
        mp->b_wptr += copy_sz;

        if (iob->ib_remain == 0) {
                mac_ether_offload_info(mp, meoi);
                return (B_TRUE);
        }

        mac_ether_offload_info(mp, meoi);
        if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
                /* If the L2 header cannot be parsed, give up now */
                return (B_FALSE);
        }
        if ((meoi->meoi_flags & MEOI_L4INFO_SET) != 0) {
                const uint32_t full_hdr_sz =
                    meoi->meoi_l2hlen + meoi->meoi_l3hlen + meoi->meoi_l4hlen;
                if (copy_sz >= full_hdr_sz) {
                        /* All headers are already copied */
                        return (B_TRUE);
                }
        }

        /*
         * The full headers do not appear to be along for the ride yet, or the
         * packet bears a protocol we do not handle.  Just allocate a
         * buffer and copy the remainder of the packet.
         */
        const uint32_t remain_sz = iob->ib_remain;
        mblk_t *remain_mp = allocb(remain_sz, 0);
        if (remain_mp == NULL) {
                return (B_FALSE);
        }
        VERIFY(iov_bunch_copy(iob, remain_mp->b_wptr, remain_sz));
        remain_mp->b_wptr += remain_sz;
        mp->b_cont = remain_mp;
        /* Refresh header info now that we have copied the rest */
        mac_ether_offload_info(mp, meoi);

        return (B_TRUE);
}

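/*
 * Transmit one frame from the ring.  Returns the payload length of the frame
 * handed to MAC, or 0 if no descriptor was available or the frame was dropped.
 */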
static size_t
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
        struct iovec            *iov = ring->vr_tx.vrt_iov;
        const uint_t            max_segs = ring->vr_tx.vrt_iov_cnt;
        uint16_t                cookie;
        vmm_page_t              *pages = NULL;
        uint32_t                total_len;
        mblk_t                  *mp_head = NULL;
        viona_desb_t            *dp = NULL;

        ASSERT(iov != NULL);

        const int n = vq_popchain(ring, iov, max_segs, &cookie, &pages,
            &total_len);
        if (n == 0) {
                VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
                VIONA_RING_STAT_INCR(ring, tx_absent);
                return (0);
        } else if (n < 0) {
                /*
                 * Any error encountered in vq_popchain has already resulted in
                 * specific probe and statistic handling.  Further action here
                 * is unnecessary.
                 */
                return (0);
        }

        /*
         * Get set up to copy the VirtIO header from in front of the packet.
         */
        iov_bunch_t iob = {
                .ib_iov = iov,
                .ib_remain = total_len,
        };
        struct virtio_net_mrgrxhdr hdr;
        uint32_t vio_hdr_len = 0;
        if (ring->vr_link->l_modern ||
            ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0)) {
                /*
                 * Presence of the "num_bufs" member in legacy mode is
                 * determined by the merge-rxbuf feature on the device, despite
                 * the fact that we are in transmission context here. In modern
                 * mode, the field is always present.
                 */
                vio_hdr_len = sizeof (struct virtio_net_mrgrxhdr);
        } else {
                vio_hdr_len = sizeof (struct virtio_net_hdr);
                /*
                 * We ignore "num_bufs" from the guest anyway, but zero it out
                 * just in case.
                 */
                hdr.vrh_bufs = 0;
        }
        const uint32_t pkt_len = total_len - vio_hdr_len;
        if (!iov_bunch_copy(&iob, &hdr, vio_hdr_len)) {
                goto drop_fail;
        }

        if (pkt_len > VIONA_MAX_PACKET_SIZE ||
            pkt_len < sizeof (struct ether_header)) {
                goto drop_fail;
        }

        mp_head = viona_tx_alloc_headers(ring, cookie, &dp, pkt_len);
        if (mp_head == NULL) {
                goto drop_fail;
        }

        /*
         * Copy the packet headers (L2 through L4, if present) to prevent
         * TOCTOU attacks in any subsequent consumers of that data.
         */
        mac_ether_offload_info_t meoi = { 0 };
        if (!viona_tx_copy_headers(ring, &iob, mp_head, &meoi)) {
                goto drop_fail;
        }

        if (dp != NULL && iob.ib_remain != 0) {
                /*
                 * If this device is loaning guest memory, rather than copying
                 * the entire body of the packet, we may need to establish mblks
                 * for the remaining data-to-be-loaned after the header copy.
                 */
                uint32_t chunk_sz;
                caddr_t chunk;
                mblk_t *mp_tail = mp_head;

                /*
                 * Ensure that our view of the tail is accurate in the rare case
                 * that the header allocation/copying logic has already resulted
                 * in a chained mblk.
                 */
                while (mp_tail->b_cont != NULL) {
                        mp_tail = mp_tail->b_cont;
                }

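                /*
                 * Wrap each remaining chunk of guest memory in a desballoc()'d
                 * mblk appended to the chain.  Each mblk takes a hold on the
                 * desb, keeping the guest pages referenced until the lower
                 * layers have finished with the data.
                 */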
                while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
                        mblk_t *mp = desballoc((uchar_t *)chunk, chunk_sz, 0,
                            &dp->d_frtn);
                        if (mp == NULL) {
                                goto drop_fail;
                        }

                        mp->b_wptr += chunk_sz;
                        dp->d_ref++;
                        mp_tail->b_cont = mp;
                        mp_tail = mp;
                }
        } else {
                /* The copy-everything strategy should be done by now */
                VERIFY0(iob.ib_remain);
        }

        if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
                /*
                 * The hook consumer may elect to free the mblk_t and set
                 * our mblk_t ** to NULL.  When using a viona_desb_t
                 * (dp != NULL), we do not want the corresponding cleanup to
                 * occur during the viona_hook() call. We instead want to
                 * reset and recycle dp for future use.  To prevent cleanup
                 * during the viona_hook() call, we take a ref on dp (if being
                 * used), and release it on success.  On failure, the
                 * freemsgchain() call will release all the refs taken earlier
                 * in viona_tx() (aside from the initial ref and the one we
                 * take), and drop_hook will reset dp for reuse.
                 */
                if (dp != NULL)
                        dp->d_ref++;

                /*
                 * Pass &mp instead of &mp_head so we don't lose track of
                 * mp_head if the hook consumer (i.e. ipf) elects to free mp
                 * and set mp to NULL.
                 */
                mblk_t *mp = mp_head;
                if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
                        if (mp != NULL)
                                freemsgchain(mp);
                        goto drop_hook;
                }

                if (dp != NULL) {
                        dp->d_ref--;

                        /*
                         * It is possible that the hook(s) accepted the packet,
                         * but as part of its processing, it issued a pull-up
                         * which released all references to the desb.  In that
                         * case, go back to acting like the packet is entirely
                         * copied (which it is).
                         */
                        if (dp->d_ref == 1) {
                                dp->d_cookie = 0;
                                dp->d_ref = 0;
                                dp = NULL;
                        }
                }
        }

        /*
         * Translate request for offloaded checksumming. If the guest sent an
         * LSO packet then it must have also negotiated and requested partial
         * checksum; therefore the LSO logic is contained within
         * viona_tx_offloads().
         */
        if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
            (hdr.vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
                if (!viona_tx_offloads(ring, &hdr, &meoi, mp_head, pkt_len)) {
                        /*
                         * If processing of any checksum offload request fails,
                         * we can still pass the packet on for transmission.
                         * Even with this best-effort behavior, which may in
                         * fact succeed in the end, we record it as an error.
                         */
                        viona_ring_stat_error(ring);
                }
        }

        if (dp != NULL) {
                /*
                 * Record the info required to post this descriptor to the
                 * used ring once its transmission has completed.
                 */
                dp->d_len = total_len;
                dp->d_pages = pages;
                mutex_enter(&ring->vr_lock);
                ring->vr_xfer_outstanding++;
                mutex_exit(&ring->vr_lock);
        } else {
                /*
                 * If the data was cloned out of the ring, the descriptors can
                 * be marked as 'used' now, rather than deferring that action
                 * until after successful packet transmission.
                 */
                vmm_drv_page_release_chain(pages);
                viona_tx_done(ring, total_len, cookie);
        }

        /*
         * From viona's point of view, this is a successful transmit, even if
         * something downstream decides to drop the packet.
         */
        VIONA_PROBE3(pkt__tx, viona_vring_t *, ring, mblk_t *, mp_head,
            size_t, pkt_len);

        /*
         * We're potentially going deep into the networking layer; make sure the
         * guest can't run concurrently.
         */
        smt_begin_unsafe();
        /*
         * Ignore, for now, any signal from MAC about whether the outgoing
         * packet was dropped or not.
         */
        (void) mac_tx(link->l_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
        smt_end_unsafe();
        return (pkt_len);

drop_fail:
        /*
         * On the off chance that memory is not available via the desballoc or
         * allocb calls, there is little left to do besides fail and drop the
         * frame on the floor.
         *
         * First account for it in the error stats.
         */
        viona_ring_stat_error(ring);

        if (dp != NULL) {
                /*
                 * Take an additional reference on the desb handle (if present)
                 * so any desballoc-sourced mblks can release their hold on it
                 * without the handle reaching its final state and executing
                 * its clean-up logic.
                 */
                dp->d_ref++;
        }

        /*
         * Free any already-allocated blocks; the full chain length is reported
         * back to the used ring below.
         */
        freemsgchain(mp_head);

drop_hook:
        if (dp != NULL) {
                VERIFY(dp->d_ref == 2);

                /* Clean up the desb handle, releasing the extra hold. */
                dp->d_len = 0;
                dp->d_cookie = 0;
                dp->d_ref = 0;
        }

        /* Count in the stats as a drop, rather than an error */
        viona_ring_stat_drop(ring, 1);

        VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, pkt_len,
            uint16_t, cookie);
        vmm_drv_page_release_chain(pages);
        viona_tx_done(ring, total_len, cookie);
        return (0);
}