root/usr/src/uts/common/io/sfxge/sfxge_tx.c
/*
 * Copyright (c) 2008-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <sys/ethernet.h>
#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* TXQ flush response timeout (in microseconds) */
#define SFXGE_TX_QFLUSH_USEC    (2000000)

/* See sfxge.conf.private for descriptions */
#define SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT 4096
#define SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT 256


/* Transmit buffer DMA attributes */
static ddi_device_acc_attr_t sfxge_tx_buffer_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_tx_buffer_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        SFXGE_TX_BUFFER_SIZE,   /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Transmit mapping DMA attributes */
static ddi_dma_attr_t sfxge_tx_mapping_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        1,                      /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        0x7fffffff,             /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Transmit queue DMA attributes */
static ddi_device_acc_attr_t sfxge_txq_devacc = {

        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_txq_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        EFX_BUF_SIZE,           /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};


/*
 * A sfxge_tx_qdpl_swizzle() can happen when the DPL get list is one packet
 * under the limit, and must move all packets from the DPL put->get list
 * Hence this is the real maximum length of the TX DPL get list.
 */
static int
sfxge_tx_dpl_get_pkt_max(sfxge_txq_t *stp)
{
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        return (stdp->get_pkt_limit + stdp->put_pkt_limit - 1);
}


static int
sfxge_tx_packet_ctor(void *buf, void *arg, int kmflags)
{
        _NOTE(ARGUNUSED(arg, kmflags))

        bzero(buf, sizeof (sfxge_tx_packet_t));

        return (0);
}

static void
sfxge_tx_packet_dtor(void *buf, void *arg)
{
        sfxge_tx_packet_t *stpp = buf;

        _NOTE(ARGUNUSED(arg))

        SFXGE_OBJ_CHECK(stpp, sfxge_tx_packet_t);
}

static int
sfxge_tx_buffer_ctor(void *buf, void *arg, int kmflags)
{
        sfxge_tx_buffer_t *stbp = buf;
        sfxge_t *sp = arg;
        sfxge_dma_buffer_attr_t dma_attr;
        int rc;

        bzero(buf, sizeof (sfxge_tx_buffer_t));

        dma_attr.sdba_dip        = sp->s_dip;
        dma_attr.sdba_dattrp     = &sfxge_tx_buffer_dma_attr;
        dma_attr.sdba_callback   = ((kmflags == KM_SLEEP) ?
            DDI_DMA_SLEEP : DDI_DMA_DONTWAIT);
        dma_attr.sdba_length     = SFXGE_TX_BUFFER_SIZE;
        dma_attr.sdba_memflags   = DDI_DMA_STREAMING;
        dma_attr.sdba_devaccp    = &sfxge_tx_buffer_devacc;
        dma_attr.sdba_bindflags  = DDI_DMA_WRITE | DDI_DMA_STREAMING;
        dma_attr.sdba_maxcookies = 1;
        dma_attr.sdba_zeroinit   = B_FALSE;

        if ((rc = sfxge_dma_buffer_create(&(stbp->stb_esm), &dma_attr)) != 0)
                goto fail1;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);

        return (-1);
}

static void
sfxge_tx_buffer_dtor(void *buf, void *arg)
{
        sfxge_tx_buffer_t *stbp = buf;

        _NOTE(ARGUNUSED(arg))

        sfxge_dma_buffer_destroy(&(stbp->stb_esm));

        SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
}

static int
sfxge_tx_mapping_ctor(void *buf, void *arg, int kmflags)
{
        sfxge_tx_mapping_t *stmp = buf;
        sfxge_t *sp = arg;
        dev_info_t *dip = sp->s_dip;
        int rc;

        bzero(buf, sizeof (sfxge_tx_mapping_t));

        stmp->stm_sp = sp;

        /* Allocate DMA handle */
        rc = ddi_dma_alloc_handle(dip, &sfxge_tx_mapping_dma_attr,
            (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
            NULL, &(stmp->stm_dma_handle));
        if (rc != DDI_SUCCESS)
                goto fail1;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        stmp->stm_sp = NULL;

        SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);

        return (-1);
}

static void
sfxge_tx_mapping_dtor(void *buf, void *arg)
{
        sfxge_tx_mapping_t *stmp = buf;

        ASSERT3P(stmp->stm_sp, ==, arg);

        /* Free the DMA handle */
        ddi_dma_free_handle(&(stmp->stm_dma_handle));
        stmp->stm_dma_handle = NULL;

        stmp->stm_sp = NULL;

        SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
}

static int
sfxge_tx_qctor(void *buf, void *arg, int kmflags)
{
        sfxge_txq_t *stp = buf;
        efsys_mem_t *esmp = &(stp->st_mem);
        sfxge_t *sp = arg;
        sfxge_dma_buffer_attr_t dma_attr;
        sfxge_tx_dpl_t *stdp;
        int rc;

        /* Compile-time structure layout checks */
        EFX_STATIC_ASSERT(sizeof (stp->__st_u1.__st_s1) <=
            sizeof (stp->__st_u1.__st_pad));
        EFX_STATIC_ASSERT(sizeof (stp->__st_u2.__st_s2) <=
            sizeof (stp->__st_u2.__st_pad));
        EFX_STATIC_ASSERT(sizeof (stp->__st_u3.__st_s3) <=
            sizeof (stp->__st_u3.__st_pad));
        EFX_STATIC_ASSERT(sizeof (stp->__st_u4.__st_s4) <=
            sizeof (stp->__st_u4.__st_pad));

        bzero(buf, sizeof (sfxge_txq_t));

        stp->st_sp = sp;

        dma_attr.sdba_dip        = sp->s_dip;
        dma_attr.sdba_dattrp     = &sfxge_txq_dma_attr;
        dma_attr.sdba_callback   = DDI_DMA_SLEEP;
        dma_attr.sdba_length     = EFX_TXQ_SIZE(SFXGE_TX_NDESCS);
        dma_attr.sdba_memflags   = DDI_DMA_CONSISTENT;
        dma_attr.sdba_devaccp    = &sfxge_txq_devacc;
        dma_attr.sdba_bindflags  = DDI_DMA_READ | DDI_DMA_CONSISTENT;
        dma_attr.sdba_maxcookies = EFX_TXQ_NBUFS(SFXGE_TX_NDESCS);
        dma_attr.sdba_zeroinit   = B_FALSE;

        if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
                goto fail1;

        /* Allocate some buffer table entries */
        if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS),
            &(stp->st_id))) != 0)
                goto fail2;

        /* Allocate the descriptor array */
        if ((stp->st_eb = kmem_zalloc(sizeof (efx_buffer_t) *
            EFX_TXQ_LIMIT(SFXGE_TX_NDESCS), kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail3;
        }

        /* Allocate the context arrays */
        if ((stp->st_stmp = kmem_zalloc(sizeof (sfxge_tx_mapping_t *) *
            SFXGE_TX_NDESCS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail4;
        }

        if ((stp->st_stbp = kmem_zalloc(sizeof (sfxge_tx_buffer_t *) *
            SFXGE_TX_NDESCS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail5;
        }

        if ((stp->st_mp = kmem_zalloc(sizeof (mblk_t *) *
            SFXGE_TX_NDESCS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail6;
        }

        /* Initialize the deferred packet list */
        stdp = &(stp->st_dpl);
        stdp->std_getp = &(stdp->std_get);

        stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;

        return (0);

fail6:
        DTRACE_PROBE(fail6);

        kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
        stp->st_stbp = NULL;

fail5:
        DTRACE_PROBE(fail5);

        kmem_free(stp->st_stmp,
            sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
        stp->st_stmp = NULL;

fail4:
        DTRACE_PROBE(fail4);

        /* Free the descriptor array */
        kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
            EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
        stp->st_eb = NULL;

fail3:
        DTRACE_PROBE(fail3);

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
        stp->st_id = 0;

fail2:
        DTRACE_PROBE(fail2);

        /* Tear down DMA setup */
        sfxge_dma_buffer_destroy(esmp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        stp->st_sp = NULL;

        SFXGE_OBJ_CHECK(stp, sfxge_txq_t);

        return (-1);
}

static void
sfxge_tx_qdtor(void *buf, void *arg)
{
        sfxge_txq_t *stp = buf;
        efsys_mem_t *esmp = &(stp->st_mem);
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp;

        _NOTE(ARGUNUSED(arg))

        stp->st_unblock = 0;

        /* Tear down the deferred packet list */
        stdp = &(stp->st_dpl);
        ASSERT3P(stdp->std_getp, ==, &(stdp->std_get));
        stdp->std_getp = NULL;

        /* Free the context arrays */
        kmem_free(stp->st_mp, sizeof (mblk_t *) * SFXGE_TX_NDESCS);
        stp->st_mp = NULL;

        kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
        stp->st_stbp = NULL;

        kmem_free(stp->st_stmp,
            sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
        stp->st_stmp = NULL;

        /* Free the descriptor array */
        kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
            EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
        stp->st_eb = NULL;

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
        stp->st_id = 0;

        /* Tear down dma setup */
        sfxge_dma_buffer_destroy(esmp);

        stp->st_sp = NULL;

        SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
}

static void
sfxge_tx_packet_destroy(sfxge_t *sp, sfxge_tx_packet_t *stpp)
{
        kmem_cache_free(sp->s_tpc, stpp);
}

static sfxge_tx_packet_t *
sfxge_tx_packet_create(sfxge_t *sp)
{
        sfxge_tx_packet_t *stpp;

        stpp = kmem_cache_alloc(sp->s_tpc, KM_NOSLEEP);

        return (stpp);
}

static inline int
sfxge_tx_qfpp_put(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp)
{
        sfxge_tx_fpp_t *stfp = &(stp->st_fpp);

        ASSERT(mutex_owned(&(stp->st_lock)));

        ASSERT3P(stpp->stp_next, ==, NULL);
        ASSERT3P(stpp->stp_mp, ==, NULL);
        ASSERT3P(stpp->stp_etherhp, ==, NULL);
        ASSERT3P(stpp->stp_iphp, ==, NULL);
        ASSERT3P(stpp->stp_thp, ==, NULL);
        ASSERT3U(stpp->stp_off, ==, 0);
        ASSERT3U(stpp->stp_size, ==, 0);
        ASSERT3U(stpp->stp_mss, ==, 0);
        ASSERT3U(stpp->stp_dpl_put_len, ==, 0);

        if (stfp->stf_count < SFXGE_TX_FPP_MAX) {
                /* Add to the start of the list */
                stpp->stp_next = stfp->stf_stpp;
                stfp->stf_stpp = stpp;
                stfp->stf_count++;

                return (0);
        }

        DTRACE_PROBE(fpp_full);
        return (ENOSPC);
}

static inline sfxge_tx_packet_t *
sfxge_tx_qfpp_get(sfxge_txq_t *stp)
{
        sfxge_tx_packet_t *stpp;
        sfxge_tx_fpp_t *stfp = &(stp->st_fpp);

        ASSERT(mutex_owned(&(stp->st_lock)));

        stpp = stfp->stf_stpp;
        if (stpp == NULL) {
                ASSERT3U(stfp->stf_count, ==, 0);
                return (NULL);
        }

        /* Remove item from the head of the list */
        stfp->stf_stpp = stpp->stp_next;
        stpp->stp_next = NULL;

        ASSERT3U(stfp->stf_count, >, 0);
        stfp->stf_count--;

        if (stfp->stf_count != 0) {
                ASSERT(stfp->stf_stpp != NULL);
                prefetch_read_many(stfp->stf_stpp);
        }
        return (stpp);
}

static void
sfxge_tx_qfpp_empty(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
        sfxge_tx_packet_t *stpp;

        mutex_enter(&(stp->st_lock));

        stpp = stfp->stf_stpp;
        stfp->stf_stpp = NULL;

        while (stpp != NULL) {
                sfxge_tx_packet_t *next;

                next = stpp->stp_next;
                stpp->stp_next = NULL;

                ASSERT3U(stfp->stf_count, >, 0);
                stfp->stf_count--;

                sfxge_tx_packet_destroy(sp, stpp);

                stpp = next;
        }
        ASSERT3U(stfp->stf_count, ==, 0);

        mutex_exit(&(stp->st_lock));
}

static inline void
sfxge_tx_qfbp_put(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp)
{
        sfxge_tx_fbp_t *stfp = &(stp->st_fbp);

        ASSERT3P(stbp->stb_next, ==, NULL);
        ASSERT3U(stbp->stb_off, ==, 0);
        ASSERT3U(stbp->stb_esm.esm_used, ==, 0);

        stbp->stb_next = stfp->stf_stbp;
        stfp->stf_stbp = stbp;
        stfp->stf_count++;
}


static inline sfxge_tx_buffer_t *
sfxge_tx_qfbp_get(sfxge_txq_t *stp)
{
        sfxge_tx_buffer_t *stbp;
        sfxge_tx_fbp_t *stfp = &(stp->st_fbp);

        stbp = stfp->stf_stbp;
        if (stbp == NULL) {
                ASSERT3U(stfp->stf_count, ==, 0);
                return (NULL);
        }

        stfp->stf_stbp = stbp->stb_next;
        stbp->stb_next = NULL;

        ASSERT3U(stfp->stf_count, >, 0);
        stfp->stf_count--;

        if (stfp->stf_count != 0) {
                ASSERT(stfp->stf_stbp != NULL);
                prefetch_read_many(stfp->stf_stbp);
        }

        return (stbp);
}

static void
sfxge_tx_qfbp_empty(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
        sfxge_tx_buffer_t *stbp;

        mutex_enter(&(stp->st_lock));

        stbp = stfp->stf_stbp;
        stfp->stf_stbp = NULL;

        while (stbp != NULL) {
                sfxge_tx_buffer_t *next;

                next = stbp->stb_next;
                stbp->stb_next = NULL;

                ASSERT3U(stfp->stf_count, >, 0);
                stfp->stf_count--;

                kmem_cache_free(sp->s_tbc, stbp);

                stbp = next;
        }
        ASSERT3U(stfp->stf_count, ==, 0);

        mutex_exit(&(stp->st_lock));
}

static inline void
sfxge_tx_qfmp_put(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp)
{
        sfxge_tx_fmp_t *stfp = &(stp->st_fmp);

        ASSERT3P(stmp->stm_next, ==, NULL);
        ASSERT3P(stmp->stm_mp, ==, NULL);
        ASSERT3P(stmp->stm_base, ==, NULL);
        ASSERT3U(stmp->stm_off, ==, 0);
        ASSERT3U(stmp->stm_size, ==, 0);

        stmp->stm_next = stfp->stf_stmp;
        stfp->stf_stmp = stmp;
        stfp->stf_count++;
}

static inline sfxge_tx_mapping_t *
sfxge_tx_qfmp_get(sfxge_txq_t *stp)
{
        sfxge_tx_mapping_t *stmp;
        sfxge_tx_fmp_t *stfp = &(stp->st_fmp);

        stmp = stfp->stf_stmp;
        if (stmp == NULL) {
                ASSERT3U(stfp->stf_count, ==, 0);
                return (NULL);
        }

        stfp->stf_stmp = stmp->stm_next;
        stmp->stm_next = NULL;

        ASSERT3U(stfp->stf_count, >, 0);
        stfp->stf_count--;

        if (stfp->stf_count != 0) {
                ASSERT(stfp->stf_stmp != NULL);
                prefetch_read_many(stfp->stf_stmp);
        }
        return (stmp);
}

static void
sfxge_tx_qfmp_empty(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
        sfxge_tx_mapping_t *stmp;

        mutex_enter(&(stp->st_lock));

        stmp = stfp->stf_stmp;
        stfp->stf_stmp = NULL;

        while (stmp != NULL) {
                sfxge_tx_mapping_t *next;

                next = stmp->stm_next;
                stmp->stm_next = NULL;

                ASSERT3U(stfp->stf_count, >, 0);
                stfp->stf_count--;

                kmem_cache_free(sp->s_tmc, stmp);

                stmp = next;
        }
        ASSERT3U(stfp->stf_count, ==, 0);

        mutex_exit(&(stp->st_lock));
}

static void
sfxge_tx_msgb_unbind(sfxge_tx_mapping_t *stmp)
{
        bzero(stmp->stm_addr, sizeof (uint64_t) * SFXGE_TX_MAPPING_NADDR);
        stmp->stm_off = 0;

        (void) ddi_dma_unbind_handle(stmp->stm_dma_handle);

        stmp->stm_size = 0;
        stmp->stm_base = NULL;

        stmp->stm_mp = NULL;
}

#define SFXGE_TX_DESCSHIFT      12
#define SFXGE_TX_DESCSIZE       (1 << 12)

#define SFXGE_TX_DESCOFFSET     (SFXGE_TX_DESCSIZE - 1)
#define SFXGE_TX_DESCMASK       (~SFXGE_TX_DESCOFFSET)

static int
sfxge_tx_msgb_bind(mblk_t *mp, sfxge_tx_mapping_t *stmp)
{
        ddi_dma_cookie_t dmac;
        unsigned int ncookies;
        size_t size;
        unsigned int n;
        int rc;

        ASSERT(mp != NULL);
        ASSERT3U(DB_TYPE(mp), ==, M_DATA);

        ASSERT(stmp->stm_mp == NULL);
        stmp->stm_mp = mp;

        stmp->stm_base = (caddr_t)(mp->b_rptr);
        stmp->stm_size = MBLKL(mp);

        /* Bind the STREAMS block to the mapping */
        rc = ddi_dma_addr_bind_handle(stmp->stm_dma_handle, NULL,
            stmp->stm_base, stmp->stm_size, DDI_DMA_WRITE | DDI_DMA_STREAMING,
            DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
        if (rc != DDI_DMA_MAPPED)
                goto fail1;

        ASSERT3U(ncookies, <=, SFXGE_TX_MAPPING_NADDR);

        /*
         * Construct an array of addresses and an initial
         * offset.
         */
        n = 0;
        stmp->stm_addr[n++] = dmac.dmac_laddress & SFXGE_TX_DESCMASK;
        DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress & SFXGE_TX_DESCMASK);

        stmp->stm_off = dmac.dmac_laddress & SFXGE_TX_DESCOFFSET;

        size = MIN(SFXGE_TX_DESCSIZE - stmp->stm_off, dmac.dmac_size);
        dmac.dmac_laddress += size;
        dmac.dmac_size -= size;

        for (;;) {
                ASSERT3U(n, <, SFXGE_TX_MAPPING_NADDR);

                if (dmac.dmac_size == 0) {
                        if (--ncookies == 0)
                                break;

                        ddi_dma_nextcookie(stmp->stm_dma_handle, &dmac);
                }

                ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCMASK) != 0);
                ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCOFFSET) == 0);
                stmp->stm_addr[n++] = dmac.dmac_laddress;
                DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress);

                size = MIN(SFXGE_TX_DESCSIZE, dmac.dmac_size);
                dmac.dmac_laddress += size;
                dmac.dmac_size -= size;
        }
        ASSERT3U(n, <=, SFXGE_TX_MAPPING_NADDR);

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        stmp->stm_size = 0;
        stmp->stm_base = NULL;

        stmp->stm_mp = NULL;

        return (-1);
}

static void
sfxge_tx_qreap(sfxge_txq_t *stp)
{
        unsigned int reaped;

        ASSERT(mutex_owned(&(stp->st_lock)));

        reaped = stp->st_reaped;
        while (reaped != stp->st_completed) {
                unsigned int id;
                sfxge_tx_mapping_t *stmp;
                sfxge_tx_buffer_t *stbp;

                id = reaped++ & (SFXGE_TX_NDESCS - 1);

                ASSERT3P(stp->st_mp[id], ==, NULL);

                if ((stmp = stp->st_stmp[id]) != NULL) {
                        stp->st_stmp[id] = NULL;

                        /* Free all the mappings */
                        do {
                                sfxge_tx_mapping_t *next;

                                next = stmp->stm_next;
                                stmp->stm_next = NULL;

                                sfxge_tx_qfmp_put(stp, stmp);

                                stmp = next;
                        } while (stmp != NULL);
                }

                if ((stbp = stp->st_stbp[id]) != NULL) {
                        stp->st_stbp[id] = NULL;

                        /* Free all the buffers */
                        do {
                                sfxge_tx_buffer_t *next;

                                next = stbp->stb_next;
                                stbp->stb_next = NULL;

                                stbp->stb_esm.esm_used = 0;
                                stbp->stb_off = 0;

                                sfxge_tx_qfbp_put(stp, stbp);

                                stbp = next;
                        } while (stbp != NULL);
                }
        }
        stp->st_reaped = reaped;
}

static void
sfxge_tx_qlist_abort(sfxge_txq_t *stp)
{
        unsigned int id;
        sfxge_tx_mapping_t *stmp;
        sfxge_tx_buffer_t *stbp;
        mblk_t *mp;

        ASSERT(mutex_owned(&(stp->st_lock)));

        id = stp->st_added & (SFXGE_TX_NDESCS - 1);

        /* Clear the completion information */
        stmp = stp->st_stmp[id];
        stp->st_stmp[id] = NULL;

        /* Free any mappings that were used */
        while (stmp != NULL) {
                sfxge_tx_mapping_t *next;

                next = stmp->stm_next;
                stmp->stm_next = NULL;

                if (stmp->stm_mp != NULL)
                        sfxge_tx_msgb_unbind(stmp);

                sfxge_tx_qfmp_put(stp, stmp);

                stmp = next;
        }

        stbp = stp->st_stbp[id];
        stp->st_stbp[id] = NULL;

        /* Free any buffers that were used */
        while (stbp != NULL) {
                sfxge_tx_buffer_t *next;

                next = stbp->stb_next;
                stbp->stb_next = NULL;

                stbp->stb_off = 0;
                stbp->stb_esm.esm_used = 0;

                sfxge_tx_qfbp_put(stp, stbp);

                stbp = next;
        }

        mp = stp->st_mp[id];
        stp->st_mp[id] = NULL;

        if (mp != NULL)
                freemsg(mp);

        /* Clear the fragment list */
        stp->st_n = 0;
}

/* Push descriptors to the TX ring setting blocked if no space */
static void
sfxge_tx_qlist_post(sfxge_txq_t *stp)
{
        unsigned int id;
        unsigned int level;
        unsigned int available;
        int rc;

        ASSERT(mutex_owned(&(stp->st_lock)));

        ASSERT(stp->st_n != 0);

again:
        level = stp->st_added - stp->st_reaped;
        available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

        id = stp->st_added & (SFXGE_TX_NDESCS - 1);

        if (available < stp->st_n) {
                rc = ENOSPC;
                goto fail1;
        }

        ASSERT3U(available, >=, stp->st_n);

        /* Post the fragment list */
        if ((rc = efx_tx_qpost(stp->st_etp, stp->st_eb, stp->st_n,
            stp->st_reaped, &(stp->st_added))) != 0)
                goto fail2;

        /*
         * If the list took more than a single descriptor then we need to
         * to move the completion information so it is referenced by the last
         * descriptor.
         */
        if (((stp->st_added - 1) & (SFXGE_TX_NDESCS - 1)) != id) {
                sfxge_tx_mapping_t *stmp;
                sfxge_tx_buffer_t *stbp;
                mblk_t *mp;

                stmp = stp->st_stmp[id];
                stp->st_stmp[id] = NULL;

                stbp = stp->st_stbp[id];
                stp->st_stbp[id] = NULL;

                mp = stp->st_mp[id];
                stp->st_mp[id] = NULL;

                id = (stp->st_added - 1) & (SFXGE_TX_NDESCS - 1);

                ASSERT(stp->st_stmp[id] == NULL);
                stp->st_stmp[id] = stmp;

                ASSERT(stp->st_stbp[id] == NULL);
                stp->st_stbp[id] = stbp;

                ASSERT(stp->st_mp[id] == NULL);
                stp->st_mp[id] = mp;
        }

        /* Clear the list */
        stp->st_n = 0;

        ASSERT3U(stp->st_unblock, ==, SFXGE_TXQ_NOT_BLOCKED);
        return;

fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        ASSERT(rc == ENOSPC);

        level = stp->st_added - stp->st_completed;
        available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;

        /*
         * If there would be enough space after we've reaped any completed
         * mappings and buffers, and we gain sufficient queue space by doing
         * so, then reap now and try posting again.
         */
        if (stp->st_n <= available &&
            stp->st_completed - stp->st_reaped >= SFXGE_TX_BATCH) {
                sfxge_tx_qreap(stp);

                goto again;
        }

        /* Set the unblock level */
        if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED) {
                stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL1;
        } else {
                ASSERT(stp->st_unblock == SFXGE_TXQ_UNBLOCK_LEVEL1);

                stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL2;
        }

        /*
         * Avoid a race with completion interrupt handling that could leave the
         * queue blocked.
         *
         * NOTE: The use of st_pending rather than st_completed is intentional
         *       as st_pending is updated per-event rather than per-batch and
         *       therefore avoids needless deferring.
         */
        if (stp->st_pending == stp->st_added) {
                sfxge_tx_qreap(stp);

                stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
                goto again;
        }

        ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED);
}

static int
sfxge_tx_kstat_update(kstat_t *ksp, int rw)
{
        sfxge_txq_t *stp = ksp->ks_private;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        kstat_named_t *knp;
        int rc;

        ASSERT(mutex_owned(&(stp->st_lock)));

        if (rw != KSTAT_READ) {
                rc = EACCES;
                goto fail1;
        }

        if (stp->st_state != SFXGE_TXQ_STARTED)
                goto done;

        efx_tx_qstats_update(stp->st_etp, stp->st_stat);
        knp = (kstat_named_t *)ksp->ks_data + TX_NQSTATS;
        knp->value.ui64 = stdp->get_pkt_limit;
        knp++;
        knp->value.ui64 = stdp->put_pkt_limit;
        knp++;
        knp->value.ui64 = stdp->get_full_count;
        knp++;
        knp->value.ui64 = stdp->put_full_count;

done:
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_tx_kstat_init(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        unsigned int index = stp->st_index;
        dev_info_t *dip = sp->s_dip;
        kstat_t *ksp;
        kstat_named_t *knp;
        char name[MAXNAMELEN];
        unsigned int id;
        int rc;

        /* Create the set */
        (void) snprintf(name, MAXNAMELEN - 1, "%s_txq%04d",
            ddi_driver_name(dip), index);

        if ((ksp = kstat_create((char *)ddi_driver_name(dip),
            ddi_get_instance(dip), name, "queue", KSTAT_TYPE_NAMED,
            TX_NQSTATS + 4, 0)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }

        stp->st_ksp = ksp;

        ksp->ks_update = sfxge_tx_kstat_update;
        ksp->ks_private = stp;
        ksp->ks_lock = &(stp->st_lock);

        /* Initialise the named stats */
        stp->st_stat = knp = ksp->ks_data;
        for (id = 0; id < TX_NQSTATS; id++) {
                kstat_named_init(knp, (char *)efx_tx_qstat_name(sp->s_enp, id),
                    KSTAT_DATA_UINT64);
                knp++;
        }
        kstat_named_init(knp, "dpl_get_pkt_limit", KSTAT_DATA_UINT64);
        knp++;
        kstat_named_init(knp, "dpl_put_pkt_limit", KSTAT_DATA_UINT64);
        knp++;
        kstat_named_init(knp, "dpl_get_full_count", KSTAT_DATA_UINT64);
        knp++;
        kstat_named_init(knp, "dpl_put_full_count", KSTAT_DATA_UINT64);

        kstat_install(ksp);
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static void
sfxge_tx_kstat_fini(sfxge_txq_t *stp)
{
        /* Destroy the set */
        kstat_delete(stp->st_ksp);
        stp->st_ksp = NULL;
        stp->st_stat = NULL;
}

static int
sfxge_tx_qinit(sfxge_t *sp, unsigned int index, sfxge_txq_type_t type,
    unsigned int evq)
{
        sfxge_txq_t *stp;
        sfxge_tx_dpl_t *stdp;
        int rc;

        ASSERT3U(index, <, EFX_ARRAY_SIZE(sp->s_stp));
        ASSERT3U(type, <, SFXGE_TXQ_NTYPES);
        ASSERT3U(evq, <, EFX_ARRAY_SIZE(sp->s_sep));

        if ((stp = kmem_cache_alloc(sp->s_tqc, KM_SLEEP)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }
        ASSERT3U(stp->st_state, ==, SFXGE_TXQ_UNINITIALIZED);

        stdp = &(stp->st_dpl);

        stp->st_index = index;
        stp->st_type = type;
        stp->st_evq = evq;

        mutex_init(&(stp->st_lock), NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(sp->s_intr.si_intr_pri));

        /* Initialize the statistics */
        if ((rc = sfxge_tx_kstat_init(stp)) != 0)
                goto fail2;

        stdp->get_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "tx_dpl_get_pkt_limit",
            SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT);

        stdp->put_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "tx_dpl_put_pkt_limit",
            SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT);

        /* Allocate a per-EVQ label for events from this TXQ */
        if ((rc = sfxge_ev_txlabel_alloc(sp, evq, stp, &(stp->st_label))) != 0)
                goto fail2;

        stp->st_state = SFXGE_TXQ_INITIALIZED;

        /* Attach the TXQ to the driver */
        ASSERT3P(sp->s_stp[index], ==, NULL);
        sp->s_stp[index] = stp;
        sp->s_tx_qcount++;

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        sfxge_tx_kstat_fini(stp);


        stp->st_evq = 0;
        stp->st_type = 0;
        stp->st_index = 0;

        mutex_destroy(&(stp->st_lock));

        kmem_cache_free(sp->s_tqc, stp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_tx_qstart(sfxge_t *sp, unsigned int index)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        efx_nic_t *enp = sp->s_enp;
        efsys_mem_t *esmp;
        sfxge_evq_t *sep;
        unsigned int evq;
        unsigned int flags;
        unsigned int desc_index;
        int rc;

        mutex_enter(&(stp->st_lock));

        esmp = &(stp->st_mem);
        evq = stp->st_evq;
        sep = sp->s_sep[evq];

        ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

        /* Zero the memory */
        bzero(esmp->esm_base, EFX_TXQ_SIZE(SFXGE_TX_NDESCS));

        /* Program the buffer table */
        if ((rc = sfxge_sram_buf_tbl_set(sp, stp->st_id, esmp,
            EFX_TXQ_NBUFS(SFXGE_TX_NDESCS))) != 0)
                goto fail1;

        switch (stp->st_type) {
        case SFXGE_TXQ_NON_CKSUM:
                flags = 0;
                break;

        case SFXGE_TXQ_IP_CKSUM:
                flags = EFX_TXQ_CKSUM_IPV4;
                break;

        case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
                flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
                break;

        default:
                ASSERT(B_FALSE);

                flags = 0;
                break;
        }

        /* Create the transmit queue */
        if ((rc = efx_tx_qcreate(enp, index, stp->st_label, esmp,
            SFXGE_TX_NDESCS, stp->st_id, flags, sep->se_eep,
            &(stp->st_etp), &desc_index)) != 0)
                goto fail2;

        /* Initialise queue descriptor indexes */
        stp->st_added = desc_index;
        stp->st_pending = desc_index;
        stp->st_completed = desc_index;
        stp->st_reaped = desc_index;

        /* Enable the transmit queue */
        efx_tx_qenable(stp->st_etp);

        stp->st_state = SFXGE_TXQ_STARTED;

        mutex_exit(&(stp->st_lock));

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, stp->st_id,
            EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(stp->st_lock));

        return (rc);
}

static inline int
sfxge_tx_qmapping_add(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp,
    size_t *offp, size_t *limitp)
{
        mblk_t *mp;
        size_t mapping_off;
        size_t mapping_size;
        int rc;

        ASSERT3U(*offp, <, stmp->stm_size);
        ASSERT(*limitp != 0);

        mp = stmp->stm_mp;

        ASSERT3P(stmp->stm_base, ==, mp->b_rptr);
        ASSERT3U(stmp->stm_size, ==, MBLKL(mp));

        mapping_off = stmp->stm_off + *offp;
        mapping_size = stmp->stm_size - *offp;

        while (mapping_size != 0 && *limitp != 0) {
                size_t page =
                    mapping_off >> SFXGE_TX_DESCSHIFT;
                size_t page_off =
                    mapping_off & SFXGE_TX_DESCOFFSET;
                size_t page_size =
                    SFXGE_TX_DESCSIZE - page_off;
                efx_buffer_t *ebp;

                ASSERT3U(page, <, SFXGE_TX_MAPPING_NADDR);
                ASSERT((stmp->stm_addr[page] & SFXGE_TX_DESCMASK) != 0);

                page_size = MIN(page_size, mapping_size);
                page_size = MIN(page_size, *limitp);

                ASSERT3U(stp->st_n, <=,
                    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
                if (stp->st_n ==
                    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
                        rc = ENOSPC;
                        goto fail1;
                }

                ebp = &(stp->st_eb[stp->st_n++]);
                ebp->eb_addr = stmp->stm_addr[page] +
                    page_off;
                ebp->eb_size = page_size;

                *offp += page_size;
                *limitp -= page_size;

                mapping_off += page_size;
                mapping_size -= page_size;

                ebp->eb_eop = (*limitp == 0 ||
                    (mapping_size == 0 && mp->b_cont == NULL));

                DTRACE_PROBE5(tx_mapping_add,
                    unsigned int, stp->st_index,
                    unsigned int, stp->st_n - 1,
                    uint64_t, ebp->eb_addr,
                    size_t, ebp->eb_size,
                    boolean_t, ebp->eb_eop);
        }

        ASSERT3U(*offp, <=, stmp->stm_size);

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static inline int
sfxge_tx_qbuffer_add(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp, boolean_t eop)
{
        efx_buffer_t *ebp;
        int rc;

        ASSERT3U(stp->st_n, <=,
            EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
        if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
                rc = ENOSPC;
                goto fail1;
        }

        ebp = &(stp->st_eb[stp->st_n++]);
        ebp->eb_addr = stbp->stb_esm.esm_addr + stbp->stb_off;
        ebp->eb_size = stbp->stb_esm.esm_used - stbp->stb_off;
        ebp->eb_eop = eop;

        (void) ddi_dma_sync(stbp->stb_esm.esm_dma_handle,
            stbp->stb_off, ebp->eb_size,
            DDI_DMA_SYNC_FORDEV);

        stbp->stb_off = stbp->stb_esm.esm_used;

        DTRACE_PROBE5(tx_buffer_add,
            unsigned int, stp->st_index,
            unsigned int, stp->st_n - 1,
            uint64_t, ebp->eb_addr, size_t, ebp->eb_size,
            boolean_t, ebp->eb_eop);

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static inline boolean_t
sfxge_tx_msgb_copy(mblk_t *mp, sfxge_tx_buffer_t *stbp, size_t *offp,
    size_t *limitp)
{
        size_t data_off;
        size_t data_size;
        size_t copy_off;
        size_t copy_size;
        boolean_t eop;

        ASSERT3U(*offp, <=, MBLKL(mp));
        ASSERT(*limitp != 0);

        data_off = *offp;
        data_size = MBLKL(mp) - *offp;

        copy_off = stbp->stb_esm.esm_used;
        copy_size = SFXGE_TX_BUFFER_SIZE - copy_off;

        copy_size = MIN(copy_size, data_size);
        copy_size = MIN(copy_size, *limitp);

        bcopy(mp->b_rptr + data_off,
            stbp->stb_esm.esm_base + copy_off, copy_size);

        stbp->stb_esm.esm_used += copy_size;
        ASSERT3U(stbp->stb_esm.esm_used, <=,
            SFXGE_TX_BUFFER_SIZE);

        *offp += copy_size;
        *limitp -= copy_size;

        data_off += copy_size;
        data_size -= copy_size;

        eop = (*limitp == 0 ||
            (data_size == 0 && mp->b_cont == NULL));

        ASSERT3U(*offp, <=, MBLKL(mp));

        return (eop);
}

static int
sfxge_tx_qpayload_fragment(sfxge_txq_t *stp, unsigned int id, mblk_t **mpp,
    size_t *offp, size_t size, boolean_t copy)
{
        sfxge_t *sp = stp->st_sp;
        mblk_t *mp = *mpp;
        size_t off = *offp;
        sfxge_tx_buffer_t *stbp;
        sfxge_tx_mapping_t *stmp;
        int rc;

        stbp = stp->st_stbp[id];
        ASSERT(stbp == NULL || (stbp->stb_esm.esm_used == stbp->stb_off));

        stmp = stp->st_stmp[id];

        while (size != 0) {
                boolean_t eop;

                ASSERT(mp != NULL);

                if (mp->b_cont != NULL)
                        prefetch_read_many(mp->b_cont);

                ASSERT3U(off, <, MBLKL(mp));

                if (copy)
                        goto copy;

                /*
                 * Check whether we have already mapped this data block for
                 * DMA.
                 */
                if (stmp == NULL || stmp->stm_mp != mp) {
                        /*
                         * If we are part way through copying a data block then
                         * there's no point in trying to map it for DMA.
                         */
                        if (off != 0)
                                goto copy;

                        /*
                         * If the data block is too short then the cost of
                         * mapping it for DMA would outweigh the cost of
                         * copying it.
                         */
                        if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
                                goto copy;

                        /* Try to grab a transmit mapping from the pool */
                        stmp = sfxge_tx_qfmp_get(stp);
                        if (stmp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * mapping.
                                 */
                                if ((stmp = kmem_cache_alloc(sp->s_tmc,
                                    KM_NOSLEEP)) == NULL)
                                        goto copy;
                        }

                        /* Add the DMA mapping to the list */
                        stmp->stm_next = stp->st_stmp[id];
                        stp->st_stmp[id] = stmp;

                        /* Try to bind the data block to the mapping */
                        if (sfxge_tx_msgb_bind(mp, stmp) != 0)
                                goto copy;
                }
                ASSERT3P(stmp->stm_mp, ==, mp);

                /*
                 * If we have a partially filled buffer then we must add it to
                 * the fragment list before adding the mapping.
                 */
                if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
                        if (rc != 0)
                                goto fail1;
                }

                /* Add the mapping to the fragment list */
                rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
                if (rc != 0)
                        goto fail2;

                ASSERT(off == MBLKL(mp) || size == 0);

                /*
                 * If the data block has been exhausted then Skip over the
                 * control block and advance to the next data block.
                 */
                if (off == MBLKL(mp)) {
                        mp = mp->b_cont;
                        off = 0;
                }

                continue;

copy:
                if (stbp == NULL ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
                        /* Try to grab a buffer from the pool */
                        stbp = sfxge_tx_qfbp_get(stp);
                        if (stbp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * buffer.
                                 */
                                if ((stbp = kmem_cache_alloc(sp->s_tbc,
                                    KM_NOSLEEP)) == NULL) {
                                        rc = ENOMEM;
                                        goto fail3;
                                }
                        }

                        /* Add it to the list */
                        stbp->stb_next = stp->st_stbp[id];
                        stp->st_stbp[id] = stbp;
                }

                /* Copy as much of the data block as we can into the buffer */
                eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);

                ASSERT(off == MBLKL(mp) || size == 0 ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);

                /*
                 * If we have reached the end of the packet, or the buffer is
                 * full, then add the buffer to the fragment list.
                 */
                if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
                        if (rc != 0)
                                goto fail4;
                }

                /*
                 * If the data block has been exhaused then advance to the next
                 * one.
                 */
                if (off == MBLKL(mp)) {
                        mp = mp->b_cont;
                        off = 0;
                }
        }

        *mpp = mp;
        *offp = off;

        return (0);

fail4:
        DTRACE_PROBE(fail4);
fail3:
        DTRACE_PROBE(fail3);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_tx_qlso_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
    boolean_t copy)
{
        sfxge_t *sp = stp->st_sp;
        mblk_t *mp = stpp->stp_mp;
        struct ether_header *etherhp = stpp->stp_etherhp;
        struct ip *iphp = stpp->stp_iphp;
        struct tcphdr *thp = stpp->stp_thp;
        size_t size = stpp->stp_size;
        size_t off = stpp->stp_off;
        size_t mss = stpp->stp_mss;
        unsigned int id;
        caddr_t hp;
        size_t ehs, hs;
        uint16_t start_len;
        uint16_t start_id;
        uint16_t ip_id;
        uint8_t start_flags;
        uint32_t start_seq;
        uint32_t th_seq;
        size_t lss;
        sfxge_tx_buffer_t *stbp;
        int rc;

        ASSERT(mutex_owned(&(stp->st_lock)));

        if ((DB_LSOFLAGS(mp) & HW_LSO) == 0) {
                rc = EINVAL;
                goto fail1;
        }

        id = stp->st_added & (SFXGE_TX_NDESCS - 1);

        ASSERT(stp->st_n == 0);
        ASSERT(stp->st_stbp[id] == NULL);
        ASSERT(stp->st_stmp[id] == NULL);

        ehs = (etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header);
        if (msgdsize(mp) != ehs + ntohs(iphp->ip_len)) {
                rc = EINVAL;
                goto fail2;
        }

        /* The payload offset is equivalent to the size of the headers */
        hp = (caddr_t)(mp->b_rptr);
        hs = off;

        /*
         * If the initial data block only contains the headers then advance
         * to the next one.
         */
        if (hs > MBLKL(mp)) {
                rc = EINVAL;
                goto fail3;
        }
        mp->b_rptr += hs;

        if (MBLKL(mp) == 0)
                mp = mp->b_cont;

        off = 0;

        /* Check IP and TCP headers are suitable for LSO */
        if (((iphp->ip_off & ~htons(IP_DF)) != 0) ||
            ((thp->th_flags & (TH_URG | TH_SYN)) != 0) ||
            (thp->th_urp != 0)) {
                rc = EINVAL;
                goto fail4;
        }

        if (size + (thp->th_off << 2) + (iphp->ip_hl << 2) !=
            ntohs(iphp->ip_len)) {
                rc = EINVAL;
                goto fail4;
        }

        /*
         * Get the base IP id, The stack leaves enough of a gap in id space
         * for us to increment this for each segment we send out.
         */
        start_len = ntohs(iphp->ip_len);
        start_id = ip_id = ntohs(iphp->ip_id);

        /* Get the base TCP sequence number and flags */
        start_flags = thp->th_flags;
        start_seq = th_seq = ntohl(thp->th_seq);

        /* Adjust the header for interim segments */
        iphp->ip_len = htons((iphp->ip_hl << 2) + (thp->th_off << 2) + mss);
        thp->th_flags = start_flags & ~(TH_PUSH | TH_FIN);

        lss = size;
        if ((lss / mss) >= (EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) / 2)) {
                rc = EINVAL;
                goto fail5;
        }

        stbp = NULL;
        while (lss != 0) {
                size_t ss = MIN(lss, mss);
                boolean_t eol = (ss == lss);

                /* Adjust the header for this segment */
                iphp->ip_id = htons(ip_id);
                ip_id++;

                thp->th_seq = htonl(th_seq);
                th_seq += ss;

                /* If this is the final segment then do some extra adjustment */
                if (eol) {
                        iphp->ip_len = htons((iphp->ip_hl << 2) +
                            (thp->th_off << 2) + ss);
                        thp->th_flags = start_flags;
                }

                if (stbp == NULL ||
                    stbp->stb_esm.esm_used + hs > SFXGE_TX_BUFFER_SIZE) {
                        /* Try to grab a buffer from the pool */
                        stbp = sfxge_tx_qfbp_get(stp);
                        if (stbp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * buffer.
                                 */
                                if ((stbp = kmem_cache_alloc(sp->s_tbc,
                                    KM_NOSLEEP)) == NULL) {
                                        rc = ENOMEM;
                                        goto fail6;
                                }
                        }

                        /* Add it to the list */
                        stbp->stb_next = stp->st_stbp[id];
                        stp->st_stbp[id] = stbp;
                }

                /* Copy in the headers */
                ASSERT3U(stbp->stb_off, ==, stbp->stb_esm.esm_used);
                bcopy(hp, stbp->stb_esm.esm_base + stbp->stb_off, hs);
                stbp->stb_esm.esm_used += hs;

                /* Add the buffer to the fragment list */
                rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
                if (rc != 0)
                        goto fail7;

                /* Add the payload to the fragment list */
                if ((rc = sfxge_tx_qpayload_fragment(stp, id, &mp, &off,
                    ss, copy)) != 0)
                        goto fail8;

                lss -= ss;
        }
        ASSERT3U(off, ==, 0);
        ASSERT3P(mp, ==, NULL);

        ASSERT3U(th_seq - start_seq, ==, size);

        /*
         * If no part of the packet has been mapped for DMA then we can free
         * it now, otherwise it can only be freed on completion.
         */
        if (stp->st_stmp[id] == NULL)
                freemsg(stpp->stp_mp);
        else
                stp->st_mp[id] = stpp->stp_mp;

        stpp->stp_mp = NULL;

        return (0);

fail8:
        DTRACE_PROBE(fail8);
fail7:
        DTRACE_PROBE(fail7);
fail6:
        DTRACE_PROBE(fail6);
fail5:
        DTRACE_PROBE(fail5);

        /* Restore the header */
        thp->th_seq = htonl(start_seq);
        thp->th_flags = start_flags;

        iphp->ip_len = htons(start_len);
        iphp->ip_id = htons(start_id);

fail4:
        DTRACE_PROBE(fail4);

        mp = stpp->stp_mp;
        mp->b_rptr -= hs;

        ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header)) +
            ntohs(iphp->ip_len), ==, msgdsize(mp));

        ASSERT(stp->st_mp[id] == NULL);

fail3:
        DTRACE_PROBE(fail3);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

static int
sfxge_tx_qpacket_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
    boolean_t copy)
{
        sfxge_t *sp = stp->st_sp;
        mblk_t *mp = stpp->stp_mp;
        unsigned int id;
        size_t off;
        size_t size;
        sfxge_tx_mapping_t *stmp;
        sfxge_tx_buffer_t *stbp;
        int rc;

        ASSERT(mutex_owned(&(stp->st_lock)));

        ASSERT(stp->st_n == 0);

        id = stp->st_added & (SFXGE_TX_NDESCS - 1);

        ASSERT(stp->st_stbp[id] == NULL);
        ASSERT(stp->st_stmp[id] == NULL);

        off = 0;
        size = LONG_MAX;        /* must be larger than the packet */

        stbp = NULL;
        stmp = NULL;

        while (mp != NULL) {
                boolean_t eop;

                ASSERT(mp != NULL);

                if (mp->b_cont != NULL)
                        prefetch_read_many(mp->b_cont);

                ASSERT(stmp == NULL || stmp->stm_mp != mp);

                if (copy)
                        goto copy;

                /*
                 * If we are part way through copying a data block then there's
                 * no point in trying to map it for DMA.
                 */
                if (off != 0)
                        goto copy;

                /*
                 * If the data block is too short then the cost of mapping it
                 * for DMA would outweigh the cost of copying it.
                 *
                 * TX copy break
                 */
                if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
                        goto copy;

                /* Try to grab a transmit mapping from the pool */
                stmp = sfxge_tx_qfmp_get(stp);
                if (stmp == NULL) {
                        /*
                         * The pool was empty so allocate a new
                         * mapping.
                         */
                        if ((stmp = kmem_cache_alloc(sp->s_tmc,
                            KM_NOSLEEP)) == NULL)
                                goto copy;
                }

                /* Add the DMA mapping to the list */
                stmp->stm_next = stp->st_stmp[id];
                stp->st_stmp[id] = stmp;

                /* Try to bind the data block to the mapping */
                if (sfxge_tx_msgb_bind(mp, stmp) != 0)
                        goto copy;

                /*
                 * If we have a partially filled buffer then we must add it to
                 * the fragment list before adding the mapping.
                 */
                if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
                        if (rc != 0)
                                goto fail1;
                }

                /* Add the mapping to the fragment list */
                rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
                if (rc != 0)
                        goto fail2;

                ASSERT3U(off, ==, MBLKL(mp));

                /* Advance to the next data block */
                mp = mp->b_cont;
                off = 0;
                continue;

copy:
                if (stbp == NULL ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
                        /* Try to grab a buffer from the pool */
                        stbp = sfxge_tx_qfbp_get(stp);
                        if (stbp == NULL) {
                                /*
                                 * The pool was empty so allocate a new
                                 * buffer.
                                 */
                                if ((stbp = kmem_cache_alloc(sp->s_tbc,
                                    KM_NOSLEEP)) == NULL) {
                                        rc = ENOMEM;
                                        goto fail3;
                                }
                        }

                        /* Add it to the list */
                        stbp->stb_next = stp->st_stbp[id];
                        stp->st_stbp[id] = stbp;
                }

                /* Copy as much of the data block as we can into the buffer */
                eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);

                ASSERT(off == MBLKL(mp) ||
                    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);

                /*
                 * If we have reached the end of the packet, or the buffer is
                 * full, then add the buffer to the fragment list.
                 */
                if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
                        rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
                        if (rc != 0)
                                goto fail4;
                }

                /*
                 * If the data block has been exhaused then advance to the next
                 * one.
                 */
                if (off == MBLKL(mp)) {
                        mp = mp->b_cont;
                        off = 0;
                }
        }
        ASSERT3U(off, ==, 0);
        ASSERT3P(mp, ==, NULL);
        ASSERT3U(size, !=, 0);

        /*
         * If no part of the packet has been mapped for DMA then we can free
         * it now, otherwise it can only be freed on completion.
         */
        if (stp->st_stmp[id] == NULL)
                freemsg(stpp->stp_mp);
        else
                stp->st_mp[id] = stpp->stp_mp;

        stpp->stp_mp = NULL;

        return (0);

fail4:
        DTRACE_PROBE(fail4);
fail3:
        DTRACE_PROBE(fail3);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        ASSERT(stp->st_stmp[id] == NULL);

        return (rc);
}


#define SFXGE_TX_QDPL_PUT_PENDING(_stp)                                 \
        ((_stp)->st_dpl.std_put != 0)

static void
sfxge_tx_qdpl_swizzle(sfxge_txq_t *stp)
{
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        volatile uintptr_t *putp;
        uintptr_t put;
        sfxge_tx_packet_t *stpp;
        sfxge_tx_packet_t *p;
        sfxge_tx_packet_t **pp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        /*
         * Guaranteed that in flight TX packets will cause more TX completions
         * hence more swizzles must happen
         */
        ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
        if (stdp->std_count >= stdp->get_pkt_limit)
                return;

        /* Acquire the put list - replacing with an empty list */
        putp = &(stdp->std_put);
        put = atomic_swap_ulong(putp, 0);
        stpp = (void *)put;

        if (stpp == NULL)
                return;

        /* Reverse the list */
        pp = &(stpp->stp_next);
        p = NULL;

        count = 0;
        do {
                sfxge_tx_packet_t *next;

                next = stpp->stp_next;

                stpp->stp_next = p;
                p = stpp;

                count++;
                stpp = next;
        } while (stpp != NULL);

        /* Add it to the tail of the get list */
        ASSERT3P(*pp, ==, NULL);

        *(stdp->std_getp) = p;
        stdp->std_getp = pp;
        stdp->std_count += count;
        ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));

        DTRACE_PROBE2(dpl_counts, int, stdp->std_count, int, count);
}


/*
 * If TXQ locked, add the RX DPL put list and this packet to the TX DPL get list
 * If TXQ unlocked, atomically add this packet to TX DPL put list
 *
 * The only possible error is ENOSPC (used for TX backpressure)
 * For the TX DPL put or get list becoming full, in both cases there must be
 * future TX completions (as represented by the packets on the DPL get lists).
 *
 * This ensures that in the future mac_tx_update() will be called from
 * sfxge_tx_qcomplete()
 */
static inline int
sfxge_tx_qdpl_add(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp, int locked)
{
        sfxge_tx_dpl_t *stdp = &stp->st_dpl;

        ASSERT3P(stpp->stp_next, ==, NULL);

        if (locked) {
                ASSERT(mutex_owned(&stp->st_lock));

                if (stdp->std_count >= stdp->get_pkt_limit) {
                        stdp->get_full_count++;
                        return (ENOSPC);
                }

                /* Reverse the put list onto the get list */
                sfxge_tx_qdpl_swizzle(stp);

                /* Add to the tail of the get list */
                *(stdp->std_getp) = stpp;
                stdp->std_getp = &stpp->stp_next;
                stdp->std_count++;
                ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));

        } else {
                volatile uintptr_t *putp;
                uintptr_t old;
                uintptr_t new;
                sfxge_tx_packet_t *old_pkt;

                putp = &(stdp->std_put);
                new = (uintptr_t)stpp;

                /* Add to the head of the put list, keeping a list length */
                do {
                        old = *putp;
                        old_pkt =  (sfxge_tx_packet_t *)old;

                        stpp->stp_dpl_put_len = old ?
                            old_pkt->stp_dpl_put_len + 1 : 1;

                        if (stpp->stp_dpl_put_len >= stdp->put_pkt_limit) {
                                stpp->stp_next = 0;
                                stpp->stp_dpl_put_len = 0;
                                stdp->put_full_count++;
                                return (ENOSPC);
                        }

                        stpp->stp_next = (void *)old;
                } while (atomic_cas_ulong(putp, old, new) != old);
        }
        return (0);
}


/* Take all packets from DPL get list and try to send to HW */
static void
sfxge_tx_qdpl_drain(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        unsigned int pushed = stp->st_added;
        sfxge_tx_packet_t *stpp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(stp->st_etp);

        stpp = stdp->std_get;
        count = stdp->std_count;

        while (count != 0) {
                sfxge_tx_packet_t *next;
                boolean_t copy;
                int rc;

                ASSERT(stpp != NULL);

                /* Split stpp off */
                next = stpp->stp_next;
                stpp->stp_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                if (stp->st_state != SFXGE_TXQ_STARTED)
                        goto reject;

                copy = B_FALSE;

again:
                /* Fragment the packet */
                if (stpp->stp_mss != 0) {
                        rc = sfxge_tx_qlso_fragment(stp, stpp, copy);
                } else {
                        rc = sfxge_tx_qpacket_fragment(stp, stpp, copy);
                }

                switch (rc) {
                case 0:
                        break;

                case ENOSPC:
                        if (!copy)
                                goto copy;

                /*FALLTHRU*/
                default:
                        goto reject;
                }

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                ASSERT3P(stpp->stp_mp, ==, NULL);

                if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
                        sfxge_tx_packet_destroy(sp, stpp);
                        stpp = NULL;
                }

                --count;
                stpp = next;

                /* Post the packet */
                sfxge_tx_qlist_post(stp);

                if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED)
                        goto defer;

                if (stp->st_added - pushed >= SFXGE_TX_BATCH) {
                        efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
                        pushed = stp->st_added;
                }

                continue;

copy:
                /* Abort the current fragment list */
                sfxge_tx_qlist_abort(stp);

                /* Try copying the packet to flatten it */
                ASSERT(!copy);
                copy = B_TRUE;

                goto again;

reject:
                /* Abort the current fragment list */
                sfxge_tx_qlist_abort(stp);

                /* Discard the packet */
                freemsg(stpp->stp_mp);
                stpp->stp_mp = NULL;

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
                        sfxge_tx_packet_destroy(sp, stpp);
                        stpp = NULL;
                }

                --count;
                stpp = next;
                continue;
defer:
                DTRACE_PROBE1(defer, unsigned int, stp->st_index);
                break;
        }

        if (count == 0) {
                /* New empty get list */
                ASSERT3P(stpp, ==, NULL);
                stdp->std_get = NULL;
                stdp->std_count = 0;

                stdp->std_getp = &(stdp->std_get);
        } else {
                /* shorten the list by moving the head */
                stdp->std_get = stpp;
                stdp->std_count = count;
                ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
        }

        if (stp->st_added != pushed)
                efx_tx_qpush(stp->st_etp, stp->st_added, pushed);

        ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED ||
            stdp->std_count == 0);
}

/* Swizzle deferred packet list, try and push to HW */
static inline void
sfxge_tx_qdpl_service(sfxge_txq_t *stp)
{
        do {
                ASSERT(mutex_owned(&(stp->st_lock)));

                if (SFXGE_TX_QDPL_PUT_PENDING(stp))
                        sfxge_tx_qdpl_swizzle(stp);

                if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED)
                        sfxge_tx_qdpl_drain(stp);

                mutex_exit(&(stp->st_lock));

                if (!SFXGE_TX_QDPL_PUT_PENDING(stp))
                        break;
        } while (mutex_tryenter(&(stp->st_lock)));
}

static void
sfxge_tx_qdpl_flush_locked(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        sfxge_tx_packet_t *stpp;
        unsigned int count;

        ASSERT(mutex_owned(&(stp->st_lock)));

        /* Swizzle put list to the get list */
        sfxge_tx_qdpl_swizzle(stp);

        stpp = stdp->std_get;
        count = stdp->std_count;

        while (count != 0) {
                sfxge_tx_packet_t *next;

                next = stpp->stp_next;
                stpp->stp_next = NULL;

                /* Discard the packet */
                freemsg(stpp->stp_mp);
                stpp->stp_mp = NULL;

                /* Free the packet structure */
                stpp->stp_etherhp = NULL;
                stpp->stp_iphp = NULL;
                stpp->stp_thp = NULL;
                stpp->stp_off = 0;
                stpp->stp_size = 0;
                stpp->stp_mss = 0;
                stpp->stp_dpl_put_len = 0;

                sfxge_tx_packet_destroy(sp, stpp);

                --count;
                stpp = next;
        }

        ASSERT3P(stpp, ==, NULL);

        /* Empty list */
        stdp->std_get = NULL;
        stdp->std_count = 0;
        stdp->std_getp = &(stdp->std_get);
}


void
sfxge_tx_qdpl_flush(sfxge_txq_t *stp)
{
        mutex_enter(&(stp->st_lock));
        sfxge_tx_qdpl_flush_locked(stp);
        mutex_exit(&(stp->st_lock));
}


static void
sfxge_tx_qunblock(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];

        ASSERT(mutex_owned(&(sep->se_lock)));

        mutex_enter(&(stp->st_lock));

        if (stp->st_state != SFXGE_TXQ_STARTED) {
                mutex_exit(&(stp->st_lock));
                return;
        }

        if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
                unsigned int level;

                level = stp->st_added - stp->st_completed;
                if (level <= stp->st_unblock) {
                        stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
                        sfxge_tx_qlist_post(stp);
                }
        }

        sfxge_tx_qdpl_service(stp);
        /* lock has been dropped */
}

void
sfxge_tx_qcomplete(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];
        unsigned int completed;

        ASSERT(mutex_owned(&(sep->se_lock)));

        completed = stp->st_completed;
        while (completed != stp->st_pending) {
                unsigned int id;
                sfxge_tx_mapping_t *stmp;

                id = completed++ & (SFXGE_TX_NDESCS - 1);

                if ((stmp = stp->st_stmp[id]) != NULL) {
                        mblk_t *mp;

                        /* Unbind all the mappings */
                        do {
                                ASSERT(stmp->stm_mp != NULL);
                                sfxge_tx_msgb_unbind(stmp);

                                stmp = stmp->stm_next;
                        } while (stmp != NULL);

                        /*
                         * Now that the packet is no longer mapped for DMA it
                         * can be freed.
                         */
                        mp = stp->st_mp[id];
                        stp->st_mp[id] = NULL;

                        ASSERT(mp != NULL);
                        freemsg(mp);
                }
        }
        stp->st_completed = completed;

        /* Check whether we need to unblock the queue */
        if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
                unsigned int level;

                level = stp->st_added - stp->st_completed;
                if (level <= stp->st_unblock)
                        sfxge_tx_qunblock(stp);
        }

        /* Release TX backpressure from the TX DPL put/get list being full */
        if (stdp->std_count < stdp->get_pkt_limit)
                mac_tx_update(sp->s_mh);
}

void
sfxge_tx_qflush_done(sfxge_txq_t *stp)
{
        sfxge_t *sp = stp->st_sp;
        boolean_t flush_pending = B_FALSE;

        ASSERT(mutex_owned(&(sp->s_sep[stp->st_evq]->se_lock)));

        mutex_enter(&(stp->st_lock));

        switch (stp->st_state) {
        case SFXGE_TXQ_INITIALIZED:
                /* Ignore flush event after TxQ destroyed */
                break;

        case SFXGE_TXQ_FLUSH_PENDING:
                flush_pending = B_TRUE;
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_FLUSH_FAILED:
                /* MC may have rebooted before handling the flush request */
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_STARTED:
                /*
                 * MC initiated flush on MC reboot or because of bad Tx
                 * descriptor
                 */
                stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                break;

        case SFXGE_TXQ_FLUSH_DONE:
                /* Ignore unexpected extra flush event */
                ASSERT(B_FALSE);
                break;

        default:
                ASSERT(B_FALSE);
        }


        mutex_exit(&(stp->st_lock));

        if (flush_pending == B_FALSE) {
                /* Flush was not pending */
                return;
        }

        mutex_enter(&(sp->s_tx_flush_lock));
        sp->s_tx_flush_pending--;
        if (sp->s_tx_flush_pending <= 0) {
                /* All queues flushed: wakeup sfxge_tx_stop() */
                cv_signal(&(sp->s_tx_flush_kv));
        }
        mutex_exit(&(sp->s_tx_flush_lock));
}

static void
sfxge_tx_qflush(sfxge_t *sp, unsigned int index, boolean_t wait_for_flush)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        int rc;

        ASSERT(mutex_owned(&(sp->s_state_lock)));
        ASSERT(mutex_owned(&(sp->s_tx_flush_lock)));

        mutex_enter(&(stp->st_lock));

        /* Prepare to flush and stop the queue */
        if (stp->st_state == SFXGE_TXQ_STARTED) {
                /* Flush the transmit queue */
                if ((rc = efx_tx_qflush(stp->st_etp)) == EALREADY) {
                        /* Already flushed, may be initiated by MC */
                        stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                } else if (rc != 0) {
                        /* Unexpected error */
                        stp->st_state = SFXGE_TXQ_FLUSH_FAILED;
                } else if (wait_for_flush) {
                        stp->st_state = SFXGE_TXQ_FLUSH_PENDING;
                        sp->s_tx_flush_pending++;
                } else {
                        /* Assume the flush is done */
                        stp->st_state = SFXGE_TXQ_FLUSH_DONE;
                }
        }

        mutex_exit(&(stp->st_lock));
}

static void
sfxge_tx_qstop(sfxge_t *sp, unsigned int index)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        unsigned int evq = stp->st_evq;
        sfxge_evq_t *sep = sp->s_sep[evq];

        mutex_enter(&(sep->se_lock));
        mutex_enter(&(stp->st_lock));

        if (stp->st_state == SFXGE_TXQ_INITIALIZED)
                goto done;

        ASSERT(stp->st_state == SFXGE_TXQ_FLUSH_PENDING ||
            stp->st_state == SFXGE_TXQ_FLUSH_DONE ||
            stp->st_state == SFXGE_TXQ_FLUSH_FAILED);

        /* All queues should have been flushed */
        if (stp->st_sp->s_tx_flush_pending != 0) {
                dev_err(sp->s_dip, CE_NOTE,
                    SFXGE_CMN_ERR "txq[%d] stop with flush_pending=%d",
                    index, stp->st_sp->s_tx_flush_pending);
        }
        if (stp->st_state == SFXGE_TXQ_FLUSH_FAILED) {
                dev_err(sp->s_dip, CE_NOTE,
                    SFXGE_CMN_ERR "txq[%d] flush failed", index);
        }

        /* Destroy the transmit queue */
        efx_tx_qdestroy(stp->st_etp);
        stp->st_etp = NULL;

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, stp->st_id,
            EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));

        sfxge_tx_qlist_abort(stp);
        ASSERT3U(stp->st_n, ==, 0);

        stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;

        stp->st_pending = stp->st_added;

        sfxge_tx_qcomplete(stp);
        ASSERT3U(stp->st_completed, ==, stp->st_pending);

        sfxge_tx_qreap(stp);
        ASSERT3U(stp->st_reaped, ==, stp->st_completed);

        /*
         * Ensure the deferred packet list is cleared
         * Can race with sfxge_tx_packet_add() adding to the put list
         */
        sfxge_tx_qdpl_flush_locked(stp);

        stp->st_added = 0;
        stp->st_pending = 0;
        stp->st_completed = 0;
        stp->st_reaped = 0;

        stp->st_state = SFXGE_TXQ_INITIALIZED;

done:
        mutex_exit(&(stp->st_lock));
        mutex_exit(&(sep->se_lock));
}

static void
sfxge_tx_qfini(sfxge_t *sp, unsigned int index)
{
        sfxge_txq_t *stp = sp->s_stp[index];
        sfxge_tx_dpl_t *stdp = &(stp->st_dpl);

        ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
        stp->st_state = SFXGE_TXQ_UNINITIALIZED;

        /* Detach the TXQ from the driver */
        sp->s_stp[index] = NULL;
        ASSERT(sp->s_tx_qcount > 0);
        sp->s_tx_qcount--;

        /* Free the EVQ label for events from this TXQ */
        (void) sfxge_ev_txlabel_free(sp, stp->st_evq, stp, stp->st_label);
        stp->st_label = 0;

        /* Tear down the statistics */
        sfxge_tx_kstat_fini(stp);

        /* Ensure the deferred packet list is empty */
        ASSERT3U(stdp->std_count, ==, 0);
        ASSERT3P(stdp->std_get, ==, NULL);
        ASSERT3U(stdp->std_put, ==, 0);

        /* Clear the free buffer pool */
        sfxge_tx_qfbp_empty(stp);

        /* Clear the free mapping pool */
        sfxge_tx_qfmp_empty(stp);

        /* Clear the free packet pool */
        sfxge_tx_qfpp_empty(stp);

        mutex_destroy(&(stp->st_lock));

        stp->st_evq = 0;
        stp->st_type = 0;
        stp->st_index = 0;

        kmem_cache_free(sp->s_tqc, stp);
}

int
sfxge_tx_init(sfxge_t *sp)
{
        sfxge_intr_t *sip = &(sp->s_intr);
        char name[MAXNAMELEN];
        sfxge_txq_type_t qtype;
        unsigned int txq, evq;
        int index;
        int rc;

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_packet_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tpc = kmem_cache_create(name, sizeof (sfxge_tx_packet_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_packet_ctor, sfxge_tx_packet_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tpc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_buffer_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tbc = kmem_cache_create(name, sizeof (sfxge_tx_buffer_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_buffer_ctor, sfxge_tx_buffer_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tbc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_mapping_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tmc = kmem_cache_create(name, sizeof (sfxge_tx_mapping_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_mapping_ctor, sfxge_tx_mapping_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_tmc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_txq_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_tqc = kmem_cache_create(name, sizeof (sfxge_txq_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_tx_qctor, sfxge_tx_qdtor, NULL, sp,
            NULL, 0);
        ASSERT(sp->s_tqc != NULL);

        /* Initialize the transmit queues. */
        sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM]         = sip->si_nalloc;
        sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM]          = 1;
        sp->s_tx_scale_max[SFXGE_TXQ_IP_TCP_UDP_CKSUM]  = sip->si_nalloc;

        /* Ensure minimum queue counts required by sfxge_tx_packet_add(). */
        if (sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] < 1)
                sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] = 1;

        if (sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] < 1)
                sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] = 1;

        txq = 0;
        for (qtype = 0; qtype < SFXGE_TXQ_NTYPES; qtype++) {
                unsigned int tx_scale = sp->s_tx_scale_max[qtype];

                if (txq + tx_scale > EFX_ARRAY_SIZE(sp->s_stp)) {
                        rc = EINVAL;
                        goto fail1;
                }

                sp->s_tx_scale_base[qtype] = txq;

                for (evq = 0; evq < tx_scale; evq++) {
                        if ((rc = sfxge_tx_qinit(sp, txq, qtype, evq)) != 0) {
                                goto fail2;
                        }
                        txq++;
                }
                ASSERT3U(txq, <=, EFX_ARRAY_SIZE(sp->s_stp));
        }

        return (0);

fail2:
        DTRACE_PROBE(fail2);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qfini(sp, index);
        }

        kmem_cache_destroy(sp->s_tqc);
        sp->s_tqc = NULL;

        kmem_cache_destroy(sp->s_tmc);
        sp->s_tmc = NULL;

        kmem_cache_destroy(sp->s_tbc);
        sp->s_tbc = NULL;

        kmem_cache_destroy(sp->s_tpc);
        sp->s_tpc = NULL;

        return (rc);
}

int
sfxge_tx_start(sfxge_t *sp)
{
        efx_nic_t *enp = sp->s_enp;
        int index;
        int rc;

        /* Initialize the transmit module */
        if ((rc = efx_tx_init(enp)) != 0)
                goto fail1;

        for (index = 0; index < EFX_ARRAY_SIZE(sp->s_stp); index++) {
                if (sp->s_stp[index] != NULL)
                        if ((rc = sfxge_tx_qstart(sp, index)) != 0)
                                goto fail2;
        }

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        sfxge_tx_stop(sp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}


/*
 * Add a packet to the TX Deferred Packet List and if the TX queue lock
 * can be acquired then call sfxge_tx_qdpl_service() to fragment and push
 * to the H/W transmit descriptor ring
 *
 * If ENOSPC is returned then the DPL is full or the packet create failed, but
 * the mblk isn't freed so that the caller can return this mblk from mc_tx() to
 * back-pressure the OS stack.
 *
 * For all other errors the mblk is freed
 */
int
sfxge_tx_packet_add(sfxge_t *sp, mblk_t *mp)
{
        struct ether_header *etherhp;
        struct ip *iphp;
        struct tcphdr *thp;
        size_t off;
        size_t size;
        size_t mss;
        sfxge_txq_t *stp;
        unsigned int txq;
        int index;
        boolean_t locked;
        sfxge_tx_packet_t *stpp;
        sfxge_packet_type_t pkt_type;
        uint16_t sport, dport;
        int rc = 0;

        ASSERT3P(mp->b_next, ==, NULL);
        ASSERT(!(DB_CKSUMFLAGS(mp) & HCK_PARTIALCKSUM));

        /*
         * Do not enqueue packets during startup/shutdown;
         *
         * NOTE: This access to the state is NOT protected by the state lock. It
         * is an imperfect test and anything further getting onto the get/put
         * deferred packet lists is cleaned up in (possibly repeated) calls to
         * sfxge_can_destroy().
         */
        if (sp->s_state != SFXGE_STARTED) {
                rc = EINVAL;
                goto fail1;
        }

        etherhp = NULL;
        iphp = NULL;
        thp = NULL;
        off = 0;
        size = 0;
        mss = 0;

        /* Check whether we need the header pointers for LSO segmentation */
        if (DB_LSOFLAGS(mp) & HW_LSO) {
                /* LSO segmentation relies on hardware checksum offload */
                DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;

                if ((mss = DB_LSOMSS(mp)) == 0) {
                        rc = EINVAL;
                        goto fail1;
                }

                pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp,
                    &off, &size, &sport, &dport);

                if (pkt_type != SFXGE_PACKET_TYPE_IPV4_TCP ||
                    etherhp == NULL ||
                    iphp == NULL ||
                    thp == NULL ||
                    off == 0) {
                        rc = EINVAL;
                        goto fail2;
                }
        }

        /* Choose the appropriate transit queue */
        if (DB_CKSUMFLAGS(mp) & HCK_FULLCKSUM) {
                sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

                if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
                        uint32_t hash;

                        if (srsp->srs_count > 1) {
                                /*
                                 * If we have not already parsed the headers
                                 * for LSO segmentation then we need to do it
                                 * now so we can calculate the hash.
                                 */
                                if (thp == NULL) {
                                        (void) sfxge_pkthdr_parse(mp, &etherhp,
                                            &iphp, &thp, &off, &size,
                                            &sport, &dport);
                                }

                                if (thp != NULL) {
                                        SFXGE_TCP_HASH(sp,
                                            &iphp->ip_dst.s_addr,
                                            thp->th_dport,
                                            &iphp->ip_src.s_addr,
                                            thp->th_sport, hash);

                                        index = srsp->srs_tbl[hash %
                                            SFXGE_RX_SCALE_MAX];
                                } else if (iphp != NULL) {
                                        /*
                                         * Calculate IPv4 4-tuple hash, with
                                         * TCP/UDP/SCTP src/dest ports. Ports
                                         * are zero for other IPv4 protocols.
                                         */
                                        SFXGE_IP_HASH(sp,
                                            &iphp->ip_dst.s_addr, dport,
                                            &iphp->ip_src.s_addr, sport, hash);

                                        index = srsp->srs_tbl[hash %
                                            SFXGE_RX_SCALE_MAX];
                                } else {
                                        /*
                                         * Other traffic always goes to the
                                         * the queue in the zero-th entry of
                                         * the RSS table.
                                         */
                                        index = srsp->srs_tbl[0];
                                }
                        } else {
                                /*
                                 * It does not matter what the hash is
                                 * because all the RSS table entries will be
                                 * the same.
                                 */
                                index = srsp->srs_tbl[0];
                        }

                        /*
                         * Find the event queue corresponding to the hash in
                         * the RSS table.
                         */
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
                            index;
                        stp = sp->s_stp[txq];
                        ASSERT3U(stp->st_evq, ==, index);
                } else {
                        index = 0;
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
                            index;
                        stp = sp->s_stp[txq];
                }
        } else if (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) {
                ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM], >=, 1);
                index = 0;
                txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_CKSUM] + index;
                stp = sp->s_stp[txq];
        } else {
                /*
                 * No hardware checksum offload requested.
                 */
                sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

                if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
                        uint32_t hash = 0;

                        if (srsp->srs_count > 1) {
                                if (iphp == NULL) {
                                        (void) sfxge_pkthdr_parse(mp, &etherhp,
                                            &iphp, &thp, &off, &size,
                                            &sport, &dport);
                                }

                                if (iphp != NULL) {
                                        /*
                                         * Calculate IPv4 4-tuple hash, with
                                         * TCP/UDP/SCTP src/dest ports. Ports
                                         * are zero for other IPv4 protocols.
                                         */
                                        SFXGE_IP_HASH(sp,
                                            &iphp->ip_dst.s_addr, dport,
                                            &iphp->ip_src.s_addr, sport, hash);

                                        hash = hash % SFXGE_RX_SCALE_MAX;
                                }
                        }
                        index = srsp->srs_tbl[hash];

                        /*
                         * The RSS table (indexed by hash) gives the RXQ index,
                         * (mapped 1:1 with EVQs). Find the TXQ that results in
                         * using the same EVQ as for the RX data path.
                         */
                        ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM],
                            >, index);
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
                        stp = sp->s_stp[txq];
                        ASSERT3U(stp->st_evq, ==, index);
                } else {
                        ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM], >, 0);
                        index = 0;
                        txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
                        stp = sp->s_stp[txq];
                }


        }
        ASSERT(stp != NULL);

        ASSERT(mss == 0 || (DB_LSOFLAGS(mp) & HW_LSO));

        /* Try to grab the lock */
        locked = mutex_tryenter(&(stp->st_lock));

        if (locked) {
                /* Try to grab a packet from the pool */
                stpp = sfxge_tx_qfpp_get(stp);
        } else {
                stpp = NULL;
        }

        if (stpp == NULL) {
                /*
                 * Either the pool was empty or we don't have the lock so
                 * allocate a new packet.
                 */
                if ((stpp = sfxge_tx_packet_create(sp)) == NULL) {
                        rc = ENOSPC;
                        goto fail3;
                }
        }

        stpp->stp_mp = mp;
        stpp->stp_etherhp = etherhp;
        stpp->stp_iphp = iphp;
        stpp->stp_thp = thp;
        stpp->stp_off = off;
        stpp->stp_size = size;
        stpp->stp_mss = mss;
        stpp->stp_dpl_put_len = 0;

        rc = sfxge_tx_qdpl_add(stp, stpp, locked);
        if (rc != 0) {
                /* ENOSPC can happen for DPL get or put list is full */
                ASSERT3U(rc, ==, ENOSPC);

                /*
                 * Note; if this is the unlocked DPL put list full case there is
                 * no need to worry about a race with locked
                 * sfxge_tx_qdpl_swizzle() as we know that the TX DPL put list
                 * was full and would have been swizzle'd to the TX DPL get
                 * list; hence guaranteeing future TX completions and calls
                 * to mac_tx_update() via sfxge_tx_qcomplete()
                 */
                goto fail4;
        }

        /* Try to grab the lock again */
        if (!locked)
                locked = mutex_tryenter(&(stp->st_lock));

        if (locked) {
                /* Try to service the list */
                sfxge_tx_qdpl_service(stp);
                /* lock has been dropped */
        }

        return (0);

fail4:
        DTRACE_PROBE(fail4);
        sfxge_tx_packet_destroy(sp, stpp);
fail3:
        DTRACE_PROBE(fail3);
        if (locked)
                mutex_exit(&(stp->st_lock));
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        if (rc != ENOSPC)
                freemsg(mp);
        return (rc);
}

void
sfxge_tx_stop(sfxge_t *sp)
{
        efx_nic_t *enp = sp->s_enp;
        clock_t timeout;
        boolean_t wait_for_flush;
        int index;

        ASSERT(mutex_owned(&(sp->s_state_lock)));

        mutex_enter(&(sp->s_tx_flush_lock));

        /* Flush all the queues */
        if (sp->s_hw_err == SFXGE_HW_OK) {
                wait_for_flush = B_TRUE;
        } else {
                /*
                 * Flag indicates possible hardware failure.
                 * Attempt flush but do not wait for it to complete.
                 */
                wait_for_flush = B_FALSE;
        }

        /* Prepare queues to stop and flush the hardware ring */
        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qflush(sp, index, wait_for_flush);
        }

        if (wait_for_flush == B_FALSE)
                goto flush_done;

        /* Wait upto 2sec for queue flushing to complete */
        timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_TX_QFLUSH_USEC);

        while (sp->s_tx_flush_pending > 0) {
                if (cv_timedwait(&(sp->s_tx_flush_kv), &(sp->s_tx_flush_lock),
                    timeout) < 0) {
                        /* Timeout waiting for queues to flush */
                        dev_info_t *dip = sp->s_dip;

                        DTRACE_PROBE(timeout);
                        dev_err(dip, CE_NOTE,
                            SFXGE_CMN_ERR "tx qflush timeout");
                        break;
                }
        }

flush_done:
        sp->s_tx_flush_pending = 0;
        mutex_exit(&(sp->s_tx_flush_lock));

        /* Stop all the queues */
        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qstop(sp, index);
        }

        /* Tear down the transmit module */
        efx_tx_fini(enp);
}

void
sfxge_tx_fini(sfxge_t *sp)
{
        int index;

        index = EFX_ARRAY_SIZE(sp->s_stp);
        while (--index >= 0) {
                if (sp->s_stp[index] != NULL)
                        sfxge_tx_qfini(sp, index);
        }

        kmem_cache_destroy(sp->s_tqc);
        sp->s_tqc = NULL;

        kmem_cache_destroy(sp->s_tmc);
        sp->s_tmc = NULL;

        kmem_cache_destroy(sp->s_tbc);
        sp->s_tbc = NULL;

        kmem_cache_destroy(sp->s_tpc);
        sp->s_tpc = NULL;
}