root/usr/src/uts/intel/io/vmxnet3s/vmxnet3_tx.c
/*
 * Copyright (C) 2007 VMware, Inc. All rights reserved.
 *
 * The contents of this file are subject to the terms of the Common
 * Development and Distribution License (the "License") version 1.0
 * and no later version.  You may not use this file except in
 * compliance with the License.
 *
 * You can obtain a copy of the License at
 *         http://www.opensource.org/licenses/cddl1.php
 *
 * See the License for the specific language governing permissions
 * and limitations under the License.
 */

/*
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

#include <vmxnet3.h>

typedef enum vmxnet3_txstatus {
        VMXNET3_TX_OK,
        VMXNET3_TX_FAILURE,
        VMXNET3_TX_PULLUP,
        VMXNET3_TX_RINGFULL
} vmxnet3_txstatus;

typedef struct vmxnet3_offload_t {
        uint16_t om;
        uint16_t hlen;
        uint16_t msscof;
} vmxnet3_offload_t;

/*
 * Initialize a TxQueue. Currently nothing needs to be done.
 */
/* ARGSUSED */
int
vmxnet3_txqueue_init(vmxnet3_softc_t *dp, vmxnet3_txqueue_t *txq)
{
        return (0);
}

/*
 * Finish a TxQueue by freeing all pending Tx.
 */
void
vmxnet3_txqueue_fini(vmxnet3_softc_t *dp, vmxnet3_txqueue_t *txq)
{
        unsigned int i;

        ASSERT(!dp->devEnabled);

        for (i = 0; i < txq->cmdRing.size; i++) {
                mblk_t *mp = txq->metaRing[i].mp;
                if (mp) {
                        freemsg(mp);
                }
        }
}

/*
 * Build the offload context of a msg.
 *
 * Returns:
 *      0 if everything went well.
 *      +n if n bytes need to be pulled up.
 *      -1 in case of error (not used).
 */
static int
vmxnet3_tx_prepare_offload(vmxnet3_softc_t *dp, vmxnet3_offload_t *ol,
    mblk_t *mp)
{
        int ret = 0;
        uint32_t start, stuff, value, flags, lso_flag, mss;

        ol->om = VMXNET3_OM_NONE;
        ol->hlen = 0;
        ol->msscof = 0;

        mac_hcksum_get(mp, &start, &stuff, NULL, &value, &flags);

        mac_lso_get(mp, &mss, &lso_flag);

        if (flags || lso_flag) {
                struct ether_vlan_header *eth = (void *) mp->b_rptr;
                uint8_t ethLen;

                if (eth->ether_tpid == htons(ETHERTYPE_VLAN)) {
                        ethLen = sizeof (struct ether_vlan_header);
                } else {
                        ethLen = sizeof (struct ether_header);
                }

                VMXNET3_DEBUG(dp, 4, "flags=0x%x, ethLen=%u, start=%u, "
                    "stuff=%u, value=%u\n", flags, ethLen, start, stuff, value);

                if (lso_flag & HW_LSO) {
                        mblk_t *mblk = mp;
                        uint8_t *ip, *tcp;
                        uint8_t ipLen, tcpLen;

                        /*
                         * Copy e1000g's behavior:
                         * - Do not assume all the headers are in the same mblk.
                         * - Assume each header is always within one mblk.
                         * - Assume the ethernet header is in the first mblk.
                         */
                        ip = mblk->b_rptr + ethLen;
                        if (ip >= mblk->b_wptr) {
                                mblk = mblk->b_cont;
                                ip = mblk->b_rptr;
                        }
                        ipLen = IPH_HDR_LENGTH((ipha_t *)ip);
                        tcp = ip + ipLen;
                        if (tcp >= mblk->b_wptr) {
                                mblk = mblk->b_cont;
                                tcp = mblk->b_rptr;
                        }
                        tcpLen = TCP_HDR_LENGTH((tcph_t *)tcp);
                        /* Careful, '>' instead of '>=' here */
                        if (tcp + tcpLen > mblk->b_wptr) {
                                mblk = mblk->b_cont;
                        }

                        ol->om = VMXNET3_OM_TSO;
                        ol->hlen = ethLen + ipLen + tcpLen;
                        ol->msscof = mss;

                        if (mblk != mp) {
                                ret = ol->hlen;
                        }
                } else if (flags & HCK_PARTIALCKSUM) {
                        ol->om = VMXNET3_OM_CSUM;
                        ol->hlen = start + ethLen;
                        ol->msscof = stuff + ethLen;
                }
        }

        return (ret);
}

/*
 * Map a msg into the Tx command ring of a vmxnet3 device.
 *
 * Returns:
 *      VMXNET3_TX_OK if everything went well.
 *      VMXNET3_TX_RINGFULL if the ring is nearly full.
 *      VMXNET3_TX_PULLUP if the msg is overfragmented.
 *      VMXNET3_TX_FAILURE if there was a DMA or offload error.
 *
 * Side effects:
 *      The ring is filled if VMXNET3_TX_OK is returned.
 */
static vmxnet3_txstatus
vmxnet3_tx_one(vmxnet3_softc_t *dp, vmxnet3_txqueue_t *txq,
    vmxnet3_offload_t *ol, mblk_t *mp)
{
        int ret = VMXNET3_TX_OK;
        unsigned int frags = 0, totLen = 0;
        vmxnet3_cmdring_t *cmdRing = &txq->cmdRing;
        Vmxnet3_TxQueueCtrl *txqCtrl = txq->sharedCtrl;
        Vmxnet3_GenericDesc *txDesc;
        uint16_t sopIdx, eopIdx;
        uint8_t sopGen, curGen;
        mblk_t *mblk;

        mutex_enter(&dp->txLock);

        sopIdx = eopIdx = cmdRing->next2fill;
        sopGen = cmdRing->gen;
        curGen = !cmdRing->gen;

        for (mblk = mp; mblk != NULL; mblk = mblk->b_cont) {
                unsigned int len = MBLKL(mblk);
                ddi_dma_cookie_t cookie;
                uint_t cookieCount;

                if (len) {
                        totLen += len;
                } else {
                        continue;
                }

                if (ddi_dma_addr_bind_handle(dp->txDmaHandle, NULL,
                    (caddr_t)mblk->b_rptr, len,
                    DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, NULL,
                    &cookie, &cookieCount) != DDI_DMA_MAPPED) {
                        VMXNET3_WARN(dp, "ddi_dma_addr_bind_handle() failed\n");
                        ret = VMXNET3_TX_FAILURE;
                        goto error;
                }

                ASSERT(cookieCount);

                do {
                        uint64_t addr = cookie.dmac_laddress;
                        size_t len = cookie.dmac_size;

                        do {
                                uint32_t dw2, dw3;
                                size_t chunkLen;

                                ASSERT(!txq->metaRing[eopIdx].mp);
                                ASSERT(cmdRing->avail - frags);

                                if (frags >= cmdRing->size - 1 ||
                                    (ol->om != VMXNET3_OM_TSO &&
                                    frags >= VMXNET3_MAX_TXD_PER_PKT)) {
                                        VMXNET3_DEBUG(dp, 2,
                                            "overfragmented mp (%u)\n", frags);
                                        (void) ddi_dma_unbind_handle(
                                            dp->txDmaHandle);
                                        ret = VMXNET3_TX_PULLUP;
                                        goto error;
                                }
                                if (cmdRing->avail - frags <= 1) {
                                        dp->txMustResched = B_TRUE;
                                        (void) ddi_dma_unbind_handle(
                                            dp->txDmaHandle);
                                        ret = VMXNET3_TX_RINGFULL;
                                        goto error;
                                }

                                if (len > VMXNET3_MAX_TX_BUF_SIZE) {
                                        chunkLen = VMXNET3_MAX_TX_BUF_SIZE;
                                } else {
                                        chunkLen = len;
                                }

                                frags++;
                                eopIdx = cmdRing->next2fill;

                                txDesc = VMXNET3_GET_DESC(cmdRing, eopIdx);
                                ASSERT(txDesc->txd.gen != cmdRing->gen);

                                /* txd.addr */
                                txDesc->txd.addr = addr;
                                /* txd.dw2 */
                                dw2 = chunkLen == VMXNET3_MAX_TX_BUF_SIZE ?
                                    0 : chunkLen;
                                dw2 |= curGen << VMXNET3_TXD_GEN_SHIFT;
                                txDesc->dword[2] = dw2;
                                ASSERT(txDesc->txd.len == len ||
                                    txDesc->txd.len == 0);
                                /* txd.dw3 */
                                dw3 = 0;
                                txDesc->dword[3] = dw3;

                                VMXNET3_INC_RING_IDX(cmdRing,
                                    cmdRing->next2fill);
                                curGen = cmdRing->gen;

                                addr += chunkLen;
                                len -= chunkLen;
                        } while (len);

                        if (--cookieCount) {
                                ddi_dma_nextcookie(dp->txDmaHandle, &cookie);
                        }
                } while (cookieCount);

                (void) ddi_dma_unbind_handle(dp->txDmaHandle);
        }

        /* Update the EOP descriptor */
        txDesc = VMXNET3_GET_DESC(cmdRing, eopIdx);
        txDesc->dword[3] |= VMXNET3_TXD_CQ | VMXNET3_TXD_EOP;

        /* Update the SOP descriptor. Must be done last */
        txDesc = VMXNET3_GET_DESC(cmdRing, sopIdx);
        if (ol->om == VMXNET3_OM_TSO && txDesc->txd.len != 0 &&
            txDesc->txd.len < ol->hlen) {
                ret = VMXNET3_TX_FAILURE;
                goto error;
        }
        txDesc->txd.om = ol->om;
        txDesc->txd.hlen = ol->hlen;
        txDesc->txd.msscof = ol->msscof;
        membar_producer();
        txDesc->txd.gen = sopGen;

        /* Update the meta ring & metadata */
        txq->metaRing[sopIdx].mp = mp;
        txq->metaRing[eopIdx].sopIdx = sopIdx;
        txq->metaRing[eopIdx].frags = frags;
        cmdRing->avail -= frags;
        if (ol->om == VMXNET3_OM_TSO) {
                txqCtrl->txNumDeferred +=
                    (totLen - ol->hlen + ol->msscof - 1) / ol->msscof;
        } else {
                txqCtrl->txNumDeferred++;
        }

        VMXNET3_DEBUG(dp, 3, "tx 0x%p on [%u;%u]\n", (void *)mp, sopIdx,
            eopIdx);

        goto done;

error:
        /* Reverse the generation bits */
        while (sopIdx != cmdRing->next2fill) {
                VMXNET3_DEC_RING_IDX(cmdRing, cmdRing->next2fill);
                txDesc = VMXNET3_GET_DESC(cmdRing, cmdRing->next2fill);
                txDesc->txd.gen = !cmdRing->gen;
        }

done:
        mutex_exit(&dp->txLock);

        return (ret);
}

/*
 * Send packets on a vmxnet3 device.
 *
 * Returns:
 *      NULL in case of success or failure.
 *      The mps to be retransmitted later if the ring is full.
 */
mblk_t *
vmxnet3_tx(void *data, mblk_t *mps)
{
        vmxnet3_softc_t *dp = data;
        vmxnet3_txqueue_t *txq = &dp->txQueue;
        vmxnet3_cmdring_t *cmdRing = &txq->cmdRing;
        Vmxnet3_TxQueueCtrl *txqCtrl = txq->sharedCtrl;
        vmxnet3_txstatus status = VMXNET3_TX_OK;
        mblk_t *mp;

        ASSERT(mps != NULL);

        do {
                vmxnet3_offload_t ol;
                int pullup;

                mp = mps;
                mps = mp->b_next;
                mp->b_next = NULL;

                if (DB_TYPE(mp) != M_DATA) {
                        /*
                         * PR #315560: M_PROTO mblks could be passed for
                         * some reason. Drop them because we don't understand
                         * them and because their contents are not Ethernet
                         * frames anyway.
                         */
                        ASSERT(B_FALSE);
                        freemsg(mp);
                        continue;
                }

                /*
                 * Prepare the offload while we're still handling the original
                 * message -- msgpullup() discards the metadata afterwards.
                 */
                pullup = vmxnet3_tx_prepare_offload(dp, &ol, mp);
                if (pullup) {
                        mblk_t *new_mp = msgpullup(mp, pullup);
                        atomic_inc_32(&dp->tx_pullup_needed);
                        freemsg(mp);
                        if (new_mp) {
                                mp = new_mp;
                        } else {
                                atomic_inc_32(&dp->tx_pullup_failed);
                                continue;
                        }
                }

                /*
                 * Try to map the message in the Tx ring.
                 * This call might fail for non-fatal reasons.
                 */
                status = vmxnet3_tx_one(dp, txq, &ol, mp);
                if (status == VMXNET3_TX_PULLUP) {
                        /*
                         * Try one more time after flattening
                         * the message with msgpullup().
                         */
                        if (mp->b_cont != NULL) {
                                mblk_t *new_mp = msgpullup(mp, -1);
                                atomic_inc_32(&dp->tx_pullup_needed);
                                freemsg(mp);
                                if (new_mp) {
                                        mp = new_mp;
                                        status = vmxnet3_tx_one(dp, txq, &ol,
                                            mp);
                                } else {
                                        atomic_inc_32(&dp->tx_pullup_failed);
                                        continue;
                                }
                        }
                }
                if (status != VMXNET3_TX_OK && status != VMXNET3_TX_RINGFULL) {
                        /* Fatal failure, drop it */
                        atomic_inc_32(&dp->tx_error);
                        freemsg(mp);
                }
        } while (mps && status != VMXNET3_TX_RINGFULL);

        if (status == VMXNET3_TX_RINGFULL) {
                atomic_inc_32(&dp->tx_ring_full);
                mp->b_next = mps;
                mps = mp;
        } else {
                ASSERT(!mps);
        }

        /* Notify the device */
        mutex_enter(&dp->txLock);
        if (txqCtrl->txNumDeferred >= txqCtrl->txThreshold) {
                txqCtrl->txNumDeferred = 0;
                VMXNET3_BAR0_PUT32(dp, VMXNET3_REG_TXPROD, cmdRing->next2fill);
        }
        mutex_exit(&dp->txLock);

        return (mps);
}

/*
 * Parse a transmit queue and complete packets.
 *
 * Returns:
 *      B_TRUE if Tx must be updated or B_FALSE if no action is required.
 */
boolean_t
vmxnet3_tx_complete(vmxnet3_softc_t *dp, vmxnet3_txqueue_t *txq)
{
        vmxnet3_cmdring_t *cmdRing = &txq->cmdRing;
        vmxnet3_compring_t *compRing = &txq->compRing;
        Vmxnet3_GenericDesc *compDesc;
        boolean_t completedTx = B_FALSE;
        boolean_t ret = B_FALSE;

        mutex_enter(&dp->txLock);

        compDesc = VMXNET3_GET_DESC(compRing, compRing->next2comp);
        while (compDesc->tcd.gen == compRing->gen) {
                vmxnet3_metatx_t *sopMetaDesc, *eopMetaDesc;
                uint16_t sopIdx, eopIdx;
                mblk_t *mp;

                eopIdx = compDesc->tcd.txdIdx;
                eopMetaDesc = &txq->metaRing[eopIdx];
                sopIdx = eopMetaDesc->sopIdx;
                sopMetaDesc = &txq->metaRing[sopIdx];

                ASSERT(eopMetaDesc->frags);
                cmdRing->avail += eopMetaDesc->frags;

                ASSERT(sopMetaDesc->mp);
                mp = sopMetaDesc->mp;
                freemsg(mp);

                eopMetaDesc->sopIdx = 0;
                eopMetaDesc->frags = 0;
                sopMetaDesc->mp = NULL;

                completedTx = B_TRUE;

                VMXNET3_DEBUG(dp, 3, "cp 0x%p on [%u;%u]\n", (void *)mp, sopIdx,
                    eopIdx);

                VMXNET3_INC_RING_IDX(compRing, compRing->next2comp);
                compDesc = VMXNET3_GET_DESC(compRing, compRing->next2comp);
        }

        if (dp->txMustResched && completedTx) {
                dp->txMustResched = B_FALSE;
                ret = B_TRUE;
        }

        mutex_exit(&dp->txLock);

        return (ret);
}