/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "igb_sw.h"

static boolean_t igb_tx(igb_tx_ring_t *, mblk_t *);
static int igb_tx_copy(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int igb_tx_bind(igb_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int igb_tx_fill_ring(igb_tx_ring_t *, link_list_t *, tx_context_t *,
    size_t);
static void igb_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *);
static int igb_get_tx_context(mblk_t *, tx_context_t *);
static boolean_t igb_check_tx_context(igb_tx_ring_t *, tx_context_t *);
static void igb_fill_tx_context(struct e1000_adv_tx_context_desc *,
    tx_context_t *, uint32_t);

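/*
 * igb_tx_ring_send
 *
 * Send a message block (packet) on the specified tx ring. Returns
 * NULL when the mblk has been consumed (transmitted or dropped), or
 * the original mblk when transmit resources are exhausted and the
 * send must be retried later.
 */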
mblk_t *
igb_tx_ring_send(void *arg, mblk_t *mp)
{
        igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg;
        igb_t *igb;

        ASSERT(tx_ring != NULL);

        igb = tx_ring->igb;

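        /*
         * Drop the packet when the adapter is suspended, has taken a
         * fatal error, has not been started, or the link is down;
         * transmit cannot make progress, so the mblk is freed rather
         * than returned for rescheduling.
         */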
        if ((igb->igb_state & IGB_SUSPENDED) ||
            (igb->igb_state & IGB_ERROR) ||
            !(igb->igb_state & IGB_STARTED) ||
            igb->link_state != LINK_STATE_UP) {
                freemsg(mp);
                return (NULL);
        }

        return ((igb_tx(tx_ring, mp)) ? NULL : mp);
}

/*
 * igb_tx - Main transmit processing
 *
 * Called from igb_m_tx with an mblk ready to transmit. This routine
 * sets up the transmit descriptors and sends the data to the wire.
 *
 * One mblk can consist of several fragments, and each fragment is
 * processed with a method chosen by its size: fragments no larger
 * than the bcopy threshold are copied with bcopy; larger fragments
 * are bound with DMA.
 *
 * To process the mblk, a tx control block is taken from the free
 * list. One tx control block contains one tx buffer, which is used
 * to copy mblk fragments' data, and one tx DMA handle, which is used
 * to bind an mblk fragment to DMA resources.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer is transmitted with one tx
 * descriptor.
 *
 * A large fragment binds with only one tx control block's DMA
 * handle, but it can span several tx descriptors for transmitting.
 *
 * So transmitting a packet (mblk) can consume several tx control
 * blocks. After the processing, those tx control blocks are put on
 * the work list.
 */
static boolean_t
igb_tx(igb_tx_ring_t *tx_ring, mblk_t *mp)
{
        igb_t *igb = tx_ring->igb;
        tx_type_t current_flag, next_flag;
        uint32_t current_len, next_len;
        uint32_t desc_total;
        size_t mbsize;
        int desc_num;
        boolean_t copy_done, eop;
        mblk_t *current_mp, *next_mp, *nmp;
        tx_control_block_t *tcb;
        tx_context_t tx_context, *ctx;
        link_list_t pending_list;
        mblk_t *hdr_new_mp = NULL;
        mblk_t *hdr_previous_mp = NULL;
        mblk_t *hdr_current_mp = NULL;
        uint32_t hdr_frag_len;
        uint32_t hdr_len, len;
        uint32_t copy_thresh;

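        /*
         * Snapshot the bcopy threshold; for LSO packets it may be
         * raised below so that the entire header is always copied.
         */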
        copy_thresh = igb->tx_copy_thresh;

        /* Get the mblk size */
        mbsize = 0;
        for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                mbsize += MBLKL(nmp);
        }

        if (igb->tx_hcksum_enable) {
                ctx = &tx_context;
                /*
                 * Retrieve offloading context information from the mblk
                 * that will be used to decide whether/how to fill the
                 * context descriptor.
                 */
                if (igb_get_tx_context(mp, ctx) != TX_CXT_SUCCESS) {
                        freemsg(mp);
                        return (B_TRUE);
                }

                if ((ctx->lso_flag &&
                    (mbsize > (ctx->mac_hdr_len + IGB_LSO_MAXLEN))) ||
                    (!ctx->lso_flag &&
                    (mbsize > (igb->max_frame_size - ETHERFCSL)))) {
                        freemsg(mp);
                        igb_log(igb, IGB_LOG_INFO, "igb_tx: packet oversize");
                        return (B_TRUE);
                }
        } else {
                ctx = NULL;
                if (mbsize > (igb->max_frame_size - ETHERFCSL)) {
                        freemsg(mp);
                        igb_log(igb, IGB_LOG_INFO, "igb_tx: packet oversize");
                        return (B_TRUE);
                }
        }

        /*
         * Check and recycle tx descriptors.
         * The recycle threshold here should be selected carefully
         */
        if (tx_ring->tbd_free < igb->tx_recycle_thresh)
                tx_ring->tx_recycle(tx_ring);

        /*
         * After the recycling, if tbd_free is still less than
         * tx_overload_thresh, assert overload and return B_FALSE;
         * the caller must reschedule the tx later.
         */
        if (tx_ring->tbd_free < igb->tx_overload_thresh) {
                tx_ring->reschedule = B_TRUE;
                IGB_DEBUG_STAT(tx_ring->stat_overload);
                return (B_FALSE);
        }

        /*
         * The software must guarantee that the LSO packet header
         * (MAC+IP+TCP) fits within one descriptor - the h/w requires it.
         * Reallocate and refill the header here if the headers
         * (MAC+IP+TCP) are not physically contiguous in memory.
         */
        if (ctx && ctx->lso_flag) {
                hdr_len = ctx->mac_hdr_len + ctx->ip_hdr_len + ctx->l4_hdr_len;
                len = MBLKL(mp);
                hdr_current_mp = mp;
                while (len < hdr_len) {
                        hdr_previous_mp = hdr_current_mp;
                        hdr_current_mp = hdr_current_mp->b_cont;
                        len += MBLKL(hdr_current_mp);
                }
                /*
                 * If len == hdr_len, the header ends exactly on an mblk
                 * boundary, so the header and the payload are in different
                 * mblks; no reallocation is needed, and we simply force the
                 * header to be copied into the pre-allocated page-aligned
                 * buffer by raising the bcopy threshold below.
                 */
                if (len == hdr_len)
                        goto adjust_threshold;

                hdr_frag_len = hdr_len - (len - MBLKL(hdr_current_mp));
                /*
                 * There are two cases in which we reallocate an
                 * mblk for the last header fragment:
                 * 1. the header is in multiple mblks and
                 *    the last fragment shares the same mblk
                 *    with the payload
                 * 2. the header is in a single mblk shared
                 *    with the payload but the header crosses
                 *    a page boundary.
                 */
                if ((hdr_current_mp != mp) ||
                    (P2NPHASE((uintptr_t)hdr_current_mp->b_rptr, igb->page_size)
                    < hdr_len)) {
                        /*
                         * Reallocate an mblk for the last header fragment;
                         * it is expected to be copied into the pre-allocated
                         * page-aligned buffer.
                         */
                        hdr_new_mp = allocb(hdr_frag_len, 0);
                        if (!hdr_new_mp) {
                                return (B_FALSE);
                        }

                        /* link the new header fragment with the other parts */
                        bcopy(hdr_current_mp->b_rptr,
                            hdr_new_mp->b_rptr, hdr_frag_len);
                        hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
                        hdr_new_mp->b_cont = hdr_current_mp;
                        if (hdr_previous_mp)
                                hdr_previous_mp->b_cont = hdr_new_mp;
                        else
                                mp = hdr_new_mp;
                        hdr_current_mp->b_rptr += hdr_frag_len;
                }
adjust_threshold:
                /*
                 * Adjust the bcopy threshold to guarantee
                 * that the header takes the bcopy path.
                 */
                if (copy_thresh < hdr_len)
                        copy_thresh = hdr_len;
        }

        /*
         * The pending_list is a linked list used to save the tx control
         * blocks whose packet data has been processed but not yet placed
         * on the tx descriptor ring. It is used to reduce contention on
         * the tx_lock.
         */
        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        desc_total = 0;

        current_mp = mp;
        current_len = MBLKL(current_mp);
        /*
         * Decide which method to use for the first fragment
         */
        current_flag = (current_len <= copy_thresh) ?
            USE_COPY : USE_DMA;
        /*
         * If the mblk includes several contiguous small fragments,
         * they may be copied into one buffer. This flag is used to
         * indicate whether there are pending fragments that need to
         * be copied to the current tx buffer.
         *
         * If this flag is B_TRUE, it indicates that a new tx control
         * block is needed to process the next fragment using either
         * copy or DMA binding.
         *
         * Otherwise, it indicates that the next fragment will be
         * copied to the current tx buffer that is maintained by the
         * current tx control block. No new tx control block is needed.
         */
        copy_done = B_TRUE;
        while (current_mp) {
                next_mp = current_mp->b_cont;
                eop = (next_mp == NULL); /* Last fragment of the packet? */
                next_len = eop ? 0: MBLKL(next_mp);

                /*
                 * When the current fragment is empty and the next fragment
                 * will still be copied to the current tx buffer, we cannot
                 * skip this fragment here, because the copy processing is
                 * still pending completion; the empty fragment has to be
                 * handled in the tx_copy routine.
                 *
                 * If the copy processing has completed, or a DMA binding
                 * has just completed, we can simply skip this empty
                 * fragment.
                 */
                if ((current_len == 0) && (copy_done)) {
                        current_mp = next_mp;
                        current_len = next_len;
                        current_flag = (current_len <= copy_thresh) ?
                            USE_COPY : USE_DMA;
                        continue;
                }

                if (copy_done) {
                        /*
                         * Get a new tx control block from the free list
                         */
                        tcb = igb_get_free_list(tx_ring);

                        if (tcb == NULL) {
                                IGB_DEBUG_STAT(tx_ring->stat_fail_no_tcb);
                                goto tx_failure;
                        }

                        /*
                         * Push the tx control block to the pending list
                         * to avoid using lock too early
                         */
                        LIST_PUSH_TAIL(&pending_list, &tcb->link);
                }

                if (current_flag == USE_COPY) {
                        /*
                         * Check whether to use bcopy or DMA binding to process
                         * the next fragment, and if using bcopy, whether we
                         * need to continue copying the next fragment into the
                         * current tx buffer.
                         */
                        ASSERT((tcb->tx_buf.len + current_len) <=
                            tcb->tx_buf.size);

                        if (eop) {
                                /*
                                 * This is the last fragment of the packet, so
                                 * the copy processing will be completed with
                                 * this fragment.
                                 */
                                next_flag = USE_NONE;
                                copy_done = B_TRUE;
                        } else if ((tcb->tx_buf.len + current_len + next_len) >
                            tcb->tx_buf.size) {
                                /*
                                 * If the next fragment is too large to be
                                 * copied to the current tx buffer, we need
                                 * to complete the current copy processing.
                                 */
                                next_flag = (next_len > copy_thresh) ?
                                    USE_DMA: USE_COPY;
                                copy_done = B_TRUE;
                        } else if (next_len > copy_thresh) {
                                /*
                                 * The next fragment needs to be processed with
                         * DMA binding. So the copy processing will be
                                 * completed with the current fragment.
                                 */
                                next_flag = USE_DMA;
                                copy_done = B_TRUE;
                        } else {
                                /*
                                 * Continue to copy the next fragment to the
                                 * current tx buffer.
                                 */
                                next_flag = USE_COPY;
                                copy_done = B_FALSE;
                        }

                        desc_num = igb_tx_copy(tx_ring, tcb, current_mp,
                            current_len, copy_done);
                } else {
                        /*
                         * Check whether to use bcopy or DMA binding to process
                         * the next fragment.
                         */
                        next_flag = (next_len > copy_thresh) ?
                            USE_DMA: USE_COPY;
                        ASSERT(copy_done == B_TRUE);

                        desc_num = igb_tx_bind(tx_ring, tcb, current_mp,
                            current_len);
                }

                if (desc_num > 0)
                        desc_total += desc_num;
                else if (desc_num < 0)
                        goto tx_failure;

                current_mp = next_mp;
                current_len = next_len;
                current_flag = next_flag;
        }

        /*
         * Attach the mblk to the last tx control block; the mblk is
         * freed only when that tcb is recycled, after the entire packet
         * has been transmitted.
         */
        ASSERT(tcb);
        ASSERT(tcb->mp == NULL);
        tcb->mp = mp;

        /*
         * Before filling the tx descriptor ring with the data, we need to
         * ensure there are adequate free descriptors for transmit
         * (including one context descriptor).
         * Do not use up all the tx descriptors.
         * Otherwise tx recycle will fail and cause a false hang.
         */
        if (tx_ring->tbd_free <= (desc_total + 1)) {
                tx_ring->tx_recycle(tx_ring);
        }

        mutex_enter(&tx_ring->tx_lock);

        /*
         * If the number of free tx descriptors is not enough for transmit
         * then return failure.
         *
         * Note: we must put this check under the mutex protection to
         * ensure the correctness when multiple threads access it in
         * parallel.
         */
        if (tx_ring->tbd_free <= (desc_total + 1)) {
                IGB_DEBUG_STAT(tx_ring->stat_fail_no_tbd);
                mutex_exit(&tx_ring->tx_lock);
                goto tx_failure;
        }

        desc_num = igb_tx_fill_ring(tx_ring, &pending_list, ctx, mbsize);

        ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

        /* Update per-ring tx statistics */
        tx_ring->tx_pkts++;
        tx_ring->tx_bytes += mbsize;

        mutex_exit(&tx_ring->tx_lock);

        return (B_TRUE);

tx_failure:
        /*
         * If a new mblk has been allocated for the last header
         * fragment of an LSO packet, we should restore the
         * modified mp.
         */
        if (hdr_new_mp) {
                hdr_new_mp->b_cont = NULL;
                freeb(hdr_new_mp);
                hdr_current_mp->b_rptr -= hdr_frag_len;
                if (hdr_previous_mp)
                        hdr_previous_mp->b_cont = hdr_current_mp;
                else
                        mp = hdr_current_mp;
        }

        /*
         * Discard the mblk and free the used resources
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb) {
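                /*
                 * Clear tcb->mp before igb_free_tcb() so that the
                 * original mblk is not freed here; it is kept for the
                 * caller to reschedule the transmit.
                 */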
                tcb->mp = NULL;

                igb_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Return the tx control blocks in the pending list to the free list.
         */
        igb_put_free_list(tx_ring, &pending_list);

        /* Transmit failed; do not drop the mblk, reschedule the transmit */
        tx_ring->reschedule = B_TRUE;

        return (B_FALSE);
}

/*
 * igb_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer
 */
static int
igb_tx_copy(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
        dma_buffer_t *tx_buf;
        uint32_t desc_num;
        _NOTE(ARGUNUSED(tx_ring));

        tx_buf = &tcb->tx_buf;

        /*
         * Copy the packet data of the mblk fragment into the
         * pre-allocated tx buffer, which is maintained by the
         * tx control block.
         *
         * Several mblk fragments can be copied into one tx buffer.
         * The destination address of the current copied fragment in
         * the tx buffer is next to the end of the previous copied
         * fragment.
         */
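        /*
         * len can be zero here for an empty fragment passed down while
         * a copy was still pending (see igb_tx()); nothing is copied,
         * but the completion logic below must still run.
         */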
        if (len > 0) {
                bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

                tx_buf->len += len;
                tcb->frag_num++;
        }

        desc_num = 0;

        /*
         * If it is the last fragment copied to the current tx buffer,
         * in other words, if there's no remaining fragment or the remaining
         * fragment requires a new tx control block to process, we need to
         * complete the current copy processing by syncing up the current
         * DMA buffer and saving the descriptor data.
         */
        if (copy_done) {
                /*
                 * Sync the DMA buffer of the packet data
                 */
                DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

                tcb->tx_type = USE_COPY;

                /*
                 * Save the address and length to the private data structure
                 * of the tx control block, which will be used to fill the
                 * tx descriptor ring after all the fragments are processed.
                 */
                igb_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
                desc_num++;
        }

        return (desc_num);
}

/*
 * igb_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
igb_tx_bind(igb_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
        int status, i;
        ddi_dma_cookie_t dma_cookie;
        uint_t ncookies;
        int desc_num;

        /*
         * Use DMA binding to process the mblk fragment
         */
        status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
            (caddr_t)mp->b_rptr, len,
            DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
            0, &dma_cookie, &ncookies);

        if (status != DDI_DMA_MAPPED) {
                IGB_DEBUG_STAT(tx_ring->stat_fail_dma_bind);
                return (-1);
        }

        tcb->frag_num++;
        tcb->tx_type = USE_DMA;
        /*
         * Each fragment can span several cookies, and each cookie
         * consumes one tx descriptor for transmit.
         */
        desc_num = 0;
        for (i = ncookies; i > 0; i--) {
                /*
                 * Save the address and length to the private data structure
                 * of the tx control block, which will be used to fill the
                 * tx descriptor ring after all the fragments are processed.
                 */
                igb_save_desc(tcb,
                    dma_cookie.dmac_laddress,
                    dma_cookie.dmac_size);

                desc_num++;

                if (i > 1)
                        ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
        }

        return (desc_num);
}

/*
 * igb_get_tx_context
 *
 * Get the tx context information from the mblk
 */
static int
igb_get_tx_context(mblk_t *mp, tx_context_t *ctx)
{
        uint32_t start;
        uint32_t flags;
        uint32_t lso_flag;
        uint32_t lso_cksum;
        uint32_t mss;
        uint32_t len;
        uint32_t size;
        uint32_t offset;
        unsigned char *pos;
        ushort_t etype;
        uint32_t mac_hdr_len;
        uint32_t l4_proto;
        uint32_t l4_hdr_len;

        ASSERT(mp != NULL);

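        /*
         * For HCK_PARTIALCKSUM, the stack supplies 'start' as the
         * checksum start offset relative to the IP header; the driver
         * uses it below as the IP header length.
         */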
        mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags);
        bzero(ctx, sizeof (tx_context_t));

        ctx->hcksum_flags = flags;

        if (flags == 0)
                return (TX_CXT_SUCCESS);

        mac_lso_get(mp, &mss, &lso_flag);
        ctx->mss = mss;
        ctx->lso_flag = (lso_flag == HW_LSO);

        etype = 0;
        mac_hdr_len = 0;
        l4_proto = 0;

        /*
         * First, get the position of the ether_type/ether_tpid.
         * Here we don't assume the ether (VLAN) header is fully included
         * in one mblk fragment, so we walk the fragments to parse the
         * ether type.
         */
        size = len = MBLKL(mp);
        offset = offsetof(struct ether_header, ether_type);
        while (size <= offset) {
                mp = mp->b_cont;
                ASSERT(mp != NULL);
                len = MBLKL(mp);
                size += len;
        }
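        /*
         * size is the total bytes walked including the current mblk,
         * and len is the current mblk's length, so (offset + len - size)
         * is the target byte's offset within the current mblk.
         */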
        pos = mp->b_rptr + offset + len - size;

        etype = ntohs(*(ushort_t *)(uintptr_t)pos);
        if (etype == ETHERTYPE_VLAN) {
                /*
                 * Get the position of the ether_type in VLAN header
                 */
                offset = offsetof(struct ether_vlan_header, ether_type);
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                etype = ntohs(*(ushort_t *)(uintptr_t)pos);
                mac_hdr_len = sizeof (struct ether_vlan_header);
        } else {
                mac_hdr_len = sizeof (struct ether_header);
        }

        /*
         * Here we assume the IP(V6) header is fully included in one
         * mblk fragment.
         */
        lso_cksum = HCK_PARTIALCKSUM;
        ctx->l3_proto = etype;
        switch (etype) {
        case ETHERTYPE_IP:
                offset = mac_hdr_len;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                if (ctx->lso_flag) {
                        *((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
                            ipha_length))) = 0;

                        /*
                         * To utilize igb LSO, we need to fill the tcp
                         * checksum field of the packet with the following
                         * pseudo-header checksum:
                         * (ip_source_addr, ip_destination_addr, l4_proto)
                         * and fill the ip header checksum with zero.
                         * Currently the tcp/ip stack has done these.
                         */
                        lso_cksum |= HCK_IPV4_HDRCKSUM;
                }

                l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
                break;
        case ETHERTYPE_IPV6:
                /*
                 * We need to zero out the length in the header.
                 */
                if (ctx->lso_flag) {
                        offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
                        while (size <= offset) {
                                mp = mp->b_cont;
                                ASSERT(mp != NULL);
                                len = MBLKL(mp);
                                size += len;
                        }
                        pos = mp->b_rptr + offset + len - size;
                        *((uint16_t *)(uintptr_t)(pos)) = 0;
                }

                offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                l4_proto = *(uint8_t *)pos;
                break;
        default:
                /* Unrecoverable error */
                igb_log(NULL, IGB_LOG_INFO, "Ethernet type field error with "
                    "tx hcksum flag set");
                return (TX_CXT_E_ETHER_TYPE);
        }

        if (ctx->lso_flag) {
                /*
                 * LSO relies on tx h/w checksum, so here the packet will be
                 * dropped if the h/w checksum flags are not set.
                 */
                if ((ctx->hcksum_flags & lso_cksum) != lso_cksum) {
                        igb_log(NULL, IGB_LOG_INFO, "igb_tx: h/w "
                            "checksum flags are not set for LSO, found "
                            "0x%x, needed bits 0x%x", ctx->hcksum_flags,
                            lso_cksum);
                        return (TX_CXT_E_LSO_CSUM);
                }

                offset = mac_hdr_len + start;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
        } else {
                /*
                 * l4 header length is only required for LSO
                 */
                l4_hdr_len = 0;
        }

        ctx->mac_hdr_len = mac_hdr_len;
        ctx->ip_hdr_len = start;
        ctx->l4_proto = l4_proto;
        ctx->l4_hdr_len = l4_hdr_len;

        return (TX_CXT_SUCCESS);
}

/*
 * igb_check_tx_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
igb_check_tx_context(igb_tx_ring_t *tx_ring, tx_context_t *ctx)
{
        tx_context_t *last;

        if (ctx == NULL)
                return (B_FALSE);

        /*
         * Compare the context data retrieved from the mblk with the
         * stored context data of the last context descriptor. The fields
         * that need to be checked are:
         *      hcksum_flags
         *      l4_proto
         *      l3_proto
         *      mss (only checked for LSO)
         *      l4_hdr_len (only checked for LSO)
         *      ip_hdr_len
         *      mac_hdr_len
         * If any one of the above fields changes, a new context
         * descriptor is needed.
         */
        last = &tx_ring->tx_context;

        if (ctx->hcksum_flags != 0) {
                if ((ctx->hcksum_flags != last->hcksum_flags) ||
                    (ctx->l4_proto != last->l4_proto) ||
                    (ctx->l3_proto != last->l3_proto) ||
                    (ctx->lso_flag && ((ctx->mss != last->mss) ||
                    (ctx->l4_hdr_len != last->l4_hdr_len))) ||
                    (ctx->ip_hdr_len != last->ip_hdr_len) ||
                    (ctx->mac_hdr_len != last->mac_hdr_len)) {
                        return (B_TRUE);
                }
        }

        return (B_FALSE);
}

/*
 * igb_fill_tx_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
igb_fill_tx_context(struct e1000_adv_tx_context_desc *ctx_tbd,
    tx_context_t *ctx, uint32_t ring_index)
{
        /*
         * Fill the context descriptor with the checksum
         * context information we've got
         */
        ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
        ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
            E1000_ADVTXD_MACLEN_SHIFT;

        ctx_tbd->type_tucmd_mlhl =
            E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_CTXT;

        /*
         * When we have a TX context set up, we enforce that the ethertype is
         * either IPv4 or IPv6 in igb_get_tx_context().
         */
        if (ctx->lso_flag || (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)) {
                if (ctx->l3_proto == ETHERTYPE_IP) {
                        ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV4;
                } else {
                        ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_IPV6;
                }
        }

        if (ctx->lso_flag || (ctx->hcksum_flags & HCK_PARTIALCKSUM)) {
                switch (ctx->l4_proto) {
                case IPPROTO_TCP:
                        ctx_tbd->type_tucmd_mlhl |= E1000_ADVTXD_TUCMD_L4T_TCP;
                        break;
                case IPPROTO_UDP:
                        /*
                         * We don't have to explicitly set:
                         *      ctx_tbd->type_tucmd_mlhl |=
                         *          E1000_ADVTXD_TUCMD_L4T_UDP;
                         * Because E1000_ADVTXD_TUCMD_L4T_UDP == 0b
                         */
                        break;
                default:
                        /* Unrecoverable error */
                        igb_log(NULL, IGB_LOG_INFO,
                            "L4 type error with tx hcksum");
                        break;
                }
        }

        ctx_tbd->seqnum_seed = 0;
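        /*
         * Write the ring index into the IDX field (shifted by 4) so
         * that each tx ring uses its own hardware context slot.
         */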
        ctx_tbd->mss_l4len_idx = ring_index << 4;
        if (ctx->lso_flag) {
                ctx_tbd->mss_l4len_idx |=
                    (ctx->l4_hdr_len << E1000_ADVTXD_L4LEN_SHIFT) |
                    (ctx->mss << E1000_ADVTXD_MSS_SHIFT);
        }
}

/*
 * igb_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list,
    tx_context_t *ctx, size_t mbsize)
{
        struct e1000_hw *hw = &tx_ring->igb->hw;
        boolean_t load_context;
        uint32_t index, tcb_index, desc_num;
        union e1000_adv_tx_desc *tbd, *first_tbd;
        tx_control_block_t *tcb, *first_tcb;
        uint32_t hcksum_flags;
        int i;
        igb_t *igb = tx_ring->igb;

        ASSERT(mutex_owned(&tx_ring->tx_lock));

        tbd = NULL;
        first_tbd = NULL;
        first_tcb = NULL;
        desc_num = 0;
        hcksum_flags = 0;
        load_context = B_FALSE;

        /*
         * Get the index of the first tx descriptor that will be filled,
         * and the index of the first work list item to which the first
         * tx control block in the pending list will be attached.
         * Note: the two indexes are the same.
         */
        index = tx_ring->tbd_tail;
        tcb_index = tx_ring->tbd_tail;

        if (ctx != NULL) {
                hcksum_flags = ctx->hcksum_flags;

                /*
                 * Check if a new context descriptor is needed for this packet
                 */
                load_context = igb_check_tx_context(tx_ring, ctx);
                if (load_context) {
                        tbd = &tx_ring->tbd_ring[index];

                        /*
                         * Fill the context descriptor with the
                         * hardware checksum offload information.
                         */
                        igb_fill_tx_context(
                            (struct e1000_adv_tx_context_desc *)tbd,
                            ctx, tx_ring->index);

                        index = NEXT_INDEX(index, 1, tx_ring->ring_size);
                        desc_num++;

                        /*
                         * Store the checksum context data if
                         * a new context descriptor is added
                         */
                        tx_ring->tx_context = *ctx;
                }
        }

        first_tbd = &tx_ring->tbd_ring[index];

        /*
         * Fill tx data descriptors with the data saved in the pending list.
         * The tx control blocks in the pending list are added to the work list
         * at the same time.
         *
         * The work list is strictly 1:1 corresponding to the descriptor ring.
         * One item of the work list corresponds to one tx descriptor. Because
         * one tx control block can span multiple tx descriptors, the tx
         * control block will be added to the first work list item that
         * corresponds to the first tx descriptor generated from that tx
         * control block.
         */
        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        first_tcb = tcb;
        while (tcb != NULL) {

                for (i = 0; i < tcb->desc_num; i++) {
                        tbd = &tx_ring->tbd_ring[index];

                        tbd->read.buffer_addr = tcb->desc[i].address;
                        tbd->read.cmd_type_len = tcb->desc[i].length;

                        tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_RS |
                            E1000_ADVTXD_DCMD_DEXT | E1000_ADVTXD_DTYP_DATA |
                            E1000_ADVTXD_DCMD_IFCS;

                        tbd->read.olinfo_status = 0;

                        index = NEXT_INDEX(index, 1, tx_ring->ring_size);
                        desc_num++;
                }

                /*
                 * Add the tx control block to the work list
                 */
                ASSERT(tx_ring->work_list[tcb_index] == NULL);
                tx_ring->work_list[tcb_index] = tcb;

                tcb_index = index;
                tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        }

        if (load_context) {
                /*
                 * Count the checksum context descriptor for
                 * the first tx control block.
                 */
                first_tcb->desc_num++;
        }
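        /*
         * Record the index of this packet's last descriptor; tx
         * recycling checks the Descriptor Done bit at that index to
         * recycle the whole packet at once.
         */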
        first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);

        /*
         * The Insert Ethernet CRC (IFCS) bit and the checksum fields are
         * only valid in the first descriptor of the packet.
         * The 82576 also requires the payload length setting even
         * without LSO.
         */
        ASSERT(first_tbd != NULL);
        first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_IFCS;
        if (ctx != NULL && ctx->lso_flag) {
                first_tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_TSE;
                first_tbd->read.olinfo_status |=
                    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
                    - ctx->l4_hdr_len) << E1000_ADVTXD_PAYLEN_SHIFT;
        } else {
                if (hw->mac.type >= e1000_82576) {
                        first_tbd->read.olinfo_status |=
                            (mbsize << E1000_ADVTXD_PAYLEN_SHIFT);
                }
        }

        /* Set hardware checksum bits */
        if (hcksum_flags != 0) {
                if (hcksum_flags & HCK_IPV4_HDRCKSUM)
                        first_tbd->read.olinfo_status |=
                            E1000_TXD_POPTS_IXSM << 8;
                if (hcksum_flags & HCK_PARTIALCKSUM)
                        first_tbd->read.olinfo_status |=
                            E1000_TXD_POPTS_TXSM << 8;
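                /*
                 * Write the ring index into the IDX field of
                 * olinfo_status to match the context descriptor
                 * loaded for this ring.
                 */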
                first_tbd->read.olinfo_status |= tx_ring->index << 4;
        }

        /*
         * The last descriptor of the packet needs the End Of Packet
         * (EOP) and Report Status (RS) bits set
         */
        ASSERT(tbd != NULL);
        tbd->read.cmd_type_len |=
            E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS;

        IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt);

        /*
         * Sync the DMA buffer of the tx descriptor ring
         */
        DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

        /*
         * Update the number of free tx descriptors.
         * The mutual exclusion between the transmission and the recycling
         * (for the tx descriptor ring and the work list) is implemented
         * with the atomic operation on the number of free tx descriptors.
         *
         * Note: we must decrement the counter tbd_free before advancing
         * the hardware TDT pointer; otherwise, the hardware could finish
         * transmitting these descriptors and the tx recycling could
         * increment tbd_free before our decrement, corrupting the count.
         */
        i = igb_atomic_reserve(&tx_ring->tbd_free, desc_num);
        ASSERT(i >= 0);

        tx_ring->tbd_tail = index;

        /*
         * Advance the hardware TDT pointer of the tx descriptor ring
         */
        E1000_WRITE_REG(hw, E1000_TDT(tx_ring->index), index);

        if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
                ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
                atomic_or_32(&igb->igb_state, IGB_ERROR);
        }

        return (desc_num);
}

/*
 * igb_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
igb_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
        sw_desc_t *desc;

        desc = &tcb->desc[tcb->desc_num];
        desc->address = address;
        desc->length = length;

        tcb->desc_num++;
}

/*
 * igb_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
igb_tx_recycle_legacy(igb_tx_ring_t *tx_ring)
{
        uint32_t index, last_index, next_index;
        int desc_num;
        boolean_t desc_done;
        tx_control_block_t *tcb;
        link_list_t pending_list;
        igb_t *igb = tx_ring->igb;

        /*
         * The mutex_tryenter() is used to avoid unnecessary
         * lock contention.
         */
        if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
                return (0);

        ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

        if (tx_ring->tbd_free == tx_ring->ring_size) {
                tx_ring->recycle_fail = 0;
                tx_ring->stall_watchdog = 0;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        /*
         * Sync the DMA buffer of the tx descriptor ring
         */
        DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

        if (igb_check_dma_handle(
            tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
                mutex_exit(&tx_ring->recycle_lock);
                ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
                atomic_or_32(&igb->igb_state, IGB_ERROR);
                return (0);
        }

        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        index = tx_ring->tbd_head;      /* Index of next tbd/tcb to recycle */

        tcb = tx_ring->work_list[index];
        ASSERT(tcb != NULL);

        while (tcb != NULL) {

                /*
                 * Get the last tx descriptor of this packet.
                 * If the last tx descriptor is done, then
                 * we can recycle all descriptors of a packet
                 * which usually includes several tx control blocks.
                 * For some chips, LSO descriptors cannot be recycled
                 * unless the whole packet's transmission is done.
                 * That's why packet-level recycling is used here.
                 */
                last_index = tcb->last_index;
                /*
                 * MAX_TX_RING_SIZE is a sentinel (set by igb_free_tcb())
                 * used to judge whether last_index is a valid value.
                 */
                if (last_index == MAX_TX_RING_SIZE)
                        break;

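                /*
                 * next_index is the first descriptor past this packet;
                 * the inner loop below walks the packet's tx control
                 * blocks until index reaches it.
                 */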
                next_index = NEXT_INDEX(last_index, 1, tx_ring->ring_size);

                /*
                 * Check if the Descriptor Done bit is set
                 */
                desc_done = tx_ring->tbd_ring[last_index].wb.status &
                    E1000_TXD_STAT_DD;
                if (desc_done) {
                        while (tcb != NULL) {
                                /*
                                 * Strip off the tx control block from the work
                                 * list, and add it to the pending list.
                                 */
                                tx_ring->work_list[index] = NULL;
                                LIST_PUSH_TAIL(&pending_list, &tcb->link);

                                /*
                                 * Count the total number of the tx descriptors
                                 * recycled.
                                 */
                                desc_num += tcb->desc_num;

                                /*
                                 * Advance the index of the tx descriptor ring
                                 */
                                index = NEXT_INDEX(index, tcb->desc_num,
                                    tx_ring->ring_size);

                                tcb = tx_ring->work_list[index];
                                if (index == next_index)
                                        break;
                        }
                } else {
                        break;
                }
        }

        /*
         * If no tx descriptors are recycled, no need to do more processing
         */
        if (desc_num == 0) {
                tx_ring->recycle_fail++;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;

        /*
         * Update the head index of the tx descriptor ring
         */
        tx_ring->tbd_head = index;

        /*
         * Update the number of the free tx descriptors with atomic operations
         */
        atomic_add_32(&tx_ring->tbd_free, desc_num);

        mutex_exit(&tx_ring->recycle_lock);

        /*
         * Free the resources used by the tx control blocks
         * in the pending list
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb != NULL) {
                /*
                 * Release the resources occupied by the tx control block
                 */
                igb_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Add the tx control blocks in the pending list to the free list.
         */
        igb_put_free_list(tx_ring, &pending_list);

        return (desc_num);
}

/*
 * igb_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
igb_tx_recycle_head_wb(igb_tx_ring_t *tx_ring)
{
        uint32_t index;
        uint32_t head_wb;
        int desc_num;
        tx_control_block_t *tcb;
        link_list_t pending_list;
        igb_t *igb = tx_ring->igb;

        /*
         * The mutex_tryenter() is used to avoid unnecessary
         * lock contention.
         */
        if (mutex_tryenter(&tx_ring->recycle_lock) == 0)
                return (0);

        ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

        if (tx_ring->tbd_free == tx_ring->ring_size) {
                tx_ring->recycle_fail = 0;
                tx_ring->stall_watchdog = 0;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        /*
         * Sync the DMA buffer of the tx descriptor ring
         *
         * Note: in head write-back mode the tx descriptors are not
         * written back; instead, the head write-back value is stored in
         * the extra tbd at the end of the DMA area, so only that value
         * needs to be synced for the kernel, rather than
         * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
         */
        (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
            sizeof (union e1000_adv_tx_desc) * tx_ring->ring_size,
            sizeof (uint32_t),
            DDI_DMA_SYNC_FORKERNEL);

        if (igb_check_dma_handle(
            tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
                mutex_exit(&tx_ring->recycle_lock);
                ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
                atomic_or_32(&igb->igb_state, IGB_ERROR);
                return (0);
        }

        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        index = tx_ring->tbd_head;      /* Next index to clean */

        /*
         * Get the value of head write-back
         */
        head_wb = *tx_ring->tbd_head_wb;
        while (index != head_wb) {
                tcb = tx_ring->work_list[index];
                ASSERT(tcb != NULL);

                if (OFFSET(index, head_wb, tx_ring->ring_size) <
                    tcb->desc_num) {
                        /*
                         * The current tx control block is not
                         * completely transmitted, stop recycling
                         */
                        break;
                }

                /*
                 * Strip off the tx control block from the work list,
                 * and add it to the pending list.
                 */
                tx_ring->work_list[index] = NULL;
                LIST_PUSH_TAIL(&pending_list, &tcb->link);

                /*
                 * Advance the index of the tx descriptor ring
                 */
                index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

                /*
                 * Count the total number of the tx descriptors recycled
                 */
                desc_num += tcb->desc_num;
        }

        /*
         * If no tx descriptors are recycled, no need to do more processing
         */
        if (desc_num == 0) {
                tx_ring->recycle_fail++;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;

        /*
         * Update the head index of the tx descriptor ring
         */
        tx_ring->tbd_head = index;

        /*
         * Update the number of the free tx descriptors with atomic operations
         */
        atomic_add_32(&tx_ring->tbd_free, desc_num);

        mutex_exit(&tx_ring->recycle_lock);

        /*
         * Free the resources used by the tx control blocks
         * in the pending list
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb) {
                /*
                 * Release the resources occupied by the tx control block
                 */
                igb_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Add the tx control blocks in the pending list to the free list.
         */
        igb_put_free_list(tx_ring, &pending_list);

        return (desc_num);
}

/*
 * igb_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbind the previously bound DMA handle, and reset other
 * control fields.
 */
void
igb_free_tcb(tx_control_block_t *tcb)
{
        switch (tcb->tx_type) {
        case USE_COPY:
                /*
                 * Reset the buffer length that is used for copy
                 */
                tcb->tx_buf.len = 0;
                break;
        case USE_DMA:
                /*
                 * Release the DMA resource that is used for
                 * DMA binding.
                 */
                (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
                break;
        default:
                break;
        }

        /*
         * Free the mblk
         */
        if (tcb->mp != NULL) {
                freemsg(tcb->mp);
                tcb->mp = NULL;
        }

        tcb->tx_type = USE_NONE;
        tcb->last_index = MAX_TX_RING_SIZE;
        tcb->frag_num = 0;
        tcb->desc_num = 0;
}

/*
 * igb_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of available tx control blocks
 * in the free list is used to keep this routine mutually exclusive
 * with the routine igb_put_free_list.
 */
static tx_control_block_t *
igb_get_free_list(igb_tx_ring_t *tx_ring)
{
        tx_control_block_t *tcb;

        /*
         * Check and update the number of free tx control blocks
         * in the free list.
         */
        if (igb_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
                return (NULL);

        mutex_enter(&tx_ring->tcb_head_lock);

        tcb = tx_ring->free_list[tx_ring->tcb_head];
        ASSERT(tcb != NULL);
        tx_ring->free_list[tx_ring->tcb_head] = NULL;
        tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
            tx_ring->free_list_size);

        mutex_exit(&tx_ring->tcb_head_lock);

        return (tcb);
}

/*
 * igb_put_free_list
 *
 * Put a list of used tx control blocks back on the free list
 *
 * A mutex is used here to ensure serialization. The mutual exclusion
 * between igb_get_free_list and igb_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
igb_put_free_list(igb_tx_ring_t *tx_ring, link_list_t *pending_list)
{
        uint32_t index;
        int tcb_num;
        tx_control_block_t *tcb;

        mutex_enter(&tx_ring->tcb_tail_lock);

        index = tx_ring->tcb_tail;

        tcb_num = 0;
        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        while (tcb != NULL) {
                ASSERT(tx_ring->free_list[index] == NULL);
                tx_ring->free_list[index] = tcb;

                tcb_num++;

                index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

                tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        }

        tx_ring->tcb_tail = index;

        /*
         * Update the number of free tx control blocks
         * in the free list. This operation must be placed
         * under the protection of the lock.
         */
        atomic_add_32(&tx_ring->tcb_free, tcb_num);

        mutex_exit(&tx_ring->tcb_tail_lock);
}