/* root/usr/src/uts/common/io/sfxge/sfxge_rx.c */
/*
 * Copyright (c) 2008-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/strft.h>
#include <sys/ksynch.h>
#include <sys/ethernet.h>
#include <sys/crc32.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <sys/ethernet.h>
#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* RXQ flush response timeout (in microseconds) */
#define SFXGE_RX_QFLUSH_USEC    (2000000)

/* RXQ flush tries in the case of failure */
#define SFXGE_RX_QFLUSH_TRIES   (5)

/* RXQ default packet buffer preallocation (number of packet buffers) */
#define SFXGE_RX_QPREALLOC      (0)

/* Receive packet DMA attributes */
static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {

        /* No endian swapping; accesses complete in program order */
        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

/*
 * Full 64-bit addressing, buffers aligned to the CPU cache line and
 * a single scatter/gather entry (one DMA cookie per packet buffer).
 */
static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        SFXGE_CPU_CACHE_SIZE,   /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Receive queue DMA attributes */
static ddi_device_acc_attr_t sfxge_rxq_devacc = {

        /* No endian swapping; accesses complete in program order */
        DDI_DEVICE_ATTR_V0,     /* devacc_attr_version */
        DDI_NEVERSWAP_ACC,      /* devacc_attr_endian_flags */
        DDI_STRICTORDER_ACC     /* devacc_attr_dataorder */
};

/*
 * Descriptor ring memory: 64-bit addressable, aligned to EFX_BUF_SIZE
 * as required by the controller's buffer table, single DMA cookie.
 */
static ddi_dma_attr_t sfxge_rxq_dma_attr = {
        DMA_ATTR_V0,            /* dma_attr_version     */
        0,                      /* dma_attr_addr_lo     */
        0xffffffffffffffffull,  /* dma_attr_addr_hi     */
        0xffffffffffffffffull,  /* dma_attr_count_max   */
        EFX_BUF_SIZE,           /* dma_attr_align       */
        0xffffffff,             /* dma_attr_burstsizes  */
        1,                      /* dma_attr_minxfer     */
        0xffffffffffffffffull,  /* dma_attr_maxxfer     */
        0xffffffffffffffffull,  /* dma_attr_seg         */
        1,                      /* dma_attr_sgllen      */
        1,                      /* dma_attr_granular    */
        0                       /* dma_attr_flags       */
};

/* Forward declaration */
static void sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);

/*
 * kmem cache constructor for receive packet objects: zero the object
 * and allocate its DMA handle.  The DMA memory itself is allocated
 * later, in sfxge_rx_qpacket_create().  Returns 0 on success, -1 on
 * failure (which causes the kmem_cache_alloc() to fail).
 */
static int
sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
{
        sfxge_rx_packet_t *srpp = buf;
        sfxge_t *sp = arg;
        dev_info_t *dip = sp->s_dip;
        int err;

        /* The named union members must fit within their padding */
        ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
            sizeof (srpp->__srp_u1.__srp_pad));
        ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
            sizeof (srpp->__srp_u2.__srp_pad));

        bzero(buf, sizeof (sfxge_rx_packet_t));

        /* Allocate a DMA handle */
        err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
            (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
            NULL, &(srpp->srp_dma_handle));
        if (err != DDI_SUCCESS)
                goto fail1;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, err);

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);

        return (-1);
}

/*
 * kmem cache destructor for receive packet objects: release the DMA
 * handle allocated by the constructor and verify the object is
 * otherwise zeroed.
 */
static void
sfxge_rx_packet_dtor(void *buf, void *arg)
{
        sfxge_rx_packet_t *srpp = buf;

        _NOTE(ARGUNUSED(arg))

        /* Free the DMA handle */
        ddi_dma_free_handle(&(srpp->srp_dma_handle));
        srpp->srp_dma_handle = NULL;

        SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
}

/*
 * kmem cache constructor for RX queue objects.  Allocates, in order:
 * the descriptor ring DMA buffer, buffer table entries, the packet
 * context array, the flow table, and the per-CPU free packet pool put
 * lists; then preallocates packets per the "rx_prealloc_pkt_buffers"
 * property.  On failure, resources are released in reverse order via
 * the failN labels and -1 is returned.
 */
static int
sfxge_rx_qctor(void *buf, void *arg, int kmflags)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = arg;
        sfxge_dma_buffer_attr_t dma_attr;
        sfxge_rx_fpp_t *srfppp;
        int nprealloc;
        unsigned int id;
        int rc;

        /* Compile-time structure layout checks */
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
            sizeof (srp->__sr_u1.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
            sizeof (srp->__sr_u2.__sr_pad));
        EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
            sizeof (srp->__sr_u3.__sr_pad));

        bzero(buf, sizeof (sfxge_rxq_t));

        srp->sr_sp = sp;

        /* Describe the descriptor ring buffer: read-only by hardware */
        dma_attr.sdba_dip        = sp->s_dip;
        dma_attr.sdba_dattrp     = &sfxge_rxq_dma_attr;
        dma_attr.sdba_callback   = DDI_DMA_SLEEP;
        dma_attr.sdba_length     = EFX_RXQ_SIZE(sp->s_rxq_size);
        dma_attr.sdba_memflags   = DDI_DMA_CONSISTENT;
        dma_attr.sdba_devaccp    = &sfxge_rxq_devacc;
        dma_attr.sdba_bindflags  = DDI_DMA_READ | DDI_DMA_CONSISTENT;
        dma_attr.sdba_maxcookies = 1;
        dma_attr.sdba_zeroinit   = B_FALSE;

        if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
                goto fail1;

        /* Allocate some buffer table entries */
        if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
            &(srp->sr_id))) != 0)
                goto fail2;

        /* Allocate the context array (one packet pointer per descriptor) */
        if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail3;
        }

        /* Allocate the flow table */
        if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail4;
        }

        srp->sr_srfpp = &(srp->sr_srfp);
        srp->sr_rto = drv_usectohz(200000);

        srp->sr_mpp = &(srp->sr_mp);

        /*
         * Initialize the free packet pool.  Each put list occupies its
         * own cache line (SFXGE_CPU_CACHE_SIZE stride) to avoid false
         * sharing between CPUs.
         */
        srfppp = &(srp->sr_fpp);
        if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
            SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
                rc = ENOMEM;
                goto fail5;
        }
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
        }

        cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);

        /* Preallocate some packets on the free packet pool */
        nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
        sfxge_rx_qpreallocate(srp, nprealloc);


        return (0);

fail5:
        DTRACE_PROBE(fail5);

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

fail4:
        DTRACE_PROBE(fail4);

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

fail3:
        DTRACE_PROBE(fail3);

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

fail2:
        DTRACE_PROBE(fail2);
        /* Remove dma setup */
        sfxge_dma_buffer_destroy(esmp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        srp->sr_sp = NULL;

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);

        return (-1);
}

/*
 * kmem cache destructor for RX queue objects: release everything the
 * constructor allocated, in reverse order of allocation, then verify
 * the object is fully zeroed.
 */
static void
sfxge_rx_qdtor(void *buf, void *arg)
{
        sfxge_rxq_t *srp = buf;
        efsys_mem_t *esmp = &(srp->sr_mem);
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int id;

        _NOTE(ARGUNUSED(arg))

        cv_destroy(&(srp->sr_flush_kv));

        /* Tear down the free packet pool (per-CPU put lists) */
        for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                putp = (void *)(srfppp->srfpp_putp + off);

                putp->srfpl_putpp = NULL;
                mutex_destroy(&(putp->srfpl_lock));

                SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
        }
        kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
            SFXGE_RX_FPP_NSLOTS);
        srfppp->srfpp_putp = NULL;

        srp->sr_mpp = NULL;

        srp->sr_rto = 0;
        srp->sr_srfpp = NULL;

        /* Free the flow table */
        kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
            SFXGE_MAX_FLOW);
        srp->sr_flow = NULL;

        /* Free the context array */
        kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
            sp->s_rxq_size);
        srp->sr_srpp = NULL;

        /* Free the buffer table entries */
        sfxge_sram_buf_tbl_free(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));
        srp->sr_id = 0;

        /* Tear down dma setup */
        sfxge_dma_buffer_destroy(esmp);

        SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
}

/* Note: This function takes ownership of *srpp. */
/* Note: This function takes ownership of *srpp. */
/*
 * Append the packet's mblk to the put list for the current CPU's
 * free packet pool slot.  Only the slot's own mutex is taken, so
 * different CPUs rarely contend.  The list is later drained into the
 * get list by sfxge_rx_qfpp_swizzle().
 */
static inline void
sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        mblk_t *mp = srpp->srp_mp;
        unsigned int id;
        size_t off;
        sfxge_rx_fpp_putlist_t *putp;

        /* The mblk must not be on any list already */
        ASSERT3P(mp->b_next, ==, NULL);
        ASSERT3P(mp->b_prev, ==, NULL);

        /* Select the put list slot for the current CPU */
        id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
        off = id * SFXGE_CPU_CACHE_SIZE;

        ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
        putp = (void *)(srpp->srp_putp + off);

        /* Append to the tail of the slot's put list */
        mutex_enter(&(putp->srfpl_lock));
        putp->srfpl_count++;
        *putp->srfpl_putpp = mp;
        putp->srfpl_putpp = &(mp->b_next);
        mutex_exit(&(putp->srfpl_lock));
}

/*
 * Drain every per-CPU put list into the pool's single get list
 * (which is protected by the event queue lock) and update the
 * loaned/available counters.  Returns the number of packets still
 * loaned out, i.e. yet to appear on any put list.
 */
static unsigned int
sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int start;
        unsigned int id;
        mblk_t *p;
        mblk_t **pp;
        unsigned int count;
        unsigned int loaned;

        ASSERT(mutex_owned(&(sep->se_lock)));

        /* We want to access the put list for the current CPU last */
        id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;

        do {
                sfxge_rx_fpp_putlist_t *putp;
                size_t off;

                off = id * SFXGE_CPU_CACHE_SIZE;
                id  = (id + 1) & SFXGE_RX_FPP_MASK;

                putp = (void *)(srfppp->srfpp_putp + off);

                /* Acquire the put list (detach it under the slot lock) */
                mutex_enter(&(putp->srfpl_lock));

                p = putp->srfpl_putp;
                pp = putp->srfpl_putpp;
                count = putp->srfpl_count;

                putp->srfpl_putp = NULL;
                putp->srfpl_putpp = &(putp->srfpl_putp);
                putp->srfpl_count = 0;

                mutex_exit(&(putp->srfpl_lock));

                if (p == NULL)
                        continue;

                /* Add the list to the head of the get list */
                *pp = srfppp->srfpp_get;
                srfppp->srfpp_get = p;

                /* Adjust the counters */
                ASSERT3U(srfppp->srfpp_loaned, >=, count);
                srfppp->srfpp_loaned -= count;
                srfppp->srfpp_count += count;

#if 0
                /* NOTE: this probe is disabled because it is expensive!! */
                DTRACE_PROBE2(count,
                    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
                    unsigned int, count);
#endif

        } while (id != start);

        /* Return the number of packets yet to appear in the put list */
        loaned = srfppp->srfpp_loaned;


        return (loaned);
}


#define DB_FRTNP(mp)    ((mp)->b_datap->db_frtnp)

/*
 * Empty the free packet pool: swizzle all put lists into the get
 * list (all packets must already have been returned, i.e. none
 * loaned), then free every packet on the get list.  Marking
 * srp_recycle B_FALSE before freeb() makes the esballoc free
 * routine destroy the packet instead of recycling it.
 */
static void
sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rx_fpp_t *srfppp;
        mblk_t *mp;

        mutex_enter(&(sep->se_lock));
        srfppp = &(srp->sr_fpp);

        /* Swizzle put list to get list */
        (void) sfxge_rx_qfpp_swizzle(srp);
        ASSERT3U(srfppp->srfpp_loaned, ==, 0);

        mp = srfppp->srfpp_get;
        srfppp->srfpp_get = NULL;

        /* Free the remainder */
        while (mp != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = mp->b_next;
                mp->b_next = NULL;

                ASSERT3U(srfppp->srfpp_count, >, 0);
                srfppp->srfpp_count--;

                freep = DB_FRTNP(mp);
                /*
                 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                 *   is implied by srpp test below
                 */
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);
                ASSERT3P(mp->b_cont, ==, NULL);
                /* Prevent recycling so freeb() destroys the packet */
                srpp->srp_recycle = B_FALSE;

                freeb(mp);

                mp = next;
        }
        ASSERT3U(srfppp->srfpp_count, ==, 0);

        srfppp->srfpp_min = 0;

        mutex_exit(&(sep->se_lock));
}

/*
 * This is an estimate of all memory consumed per RX packet.
 * It can be inaccurate, but sp->s_rx_pkt_mem_alloc mustn't drift.
 */
static uint64_t
sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
{
        uint64_t bytes;

        /* Data buffer plus STREAMS headers plus the packet descriptor */
        bytes = srpp->srp_mblksize;
        bytes += sizeof (mblk_t);
        bytes += sizeof (dblk_t);
        bytes += sizeof (sfxge_rx_packet_t);

        return (bytes);
}

/*
 * Destroy a receive packet that is not being recycled: clear its
 * parsing state, unbind and free the DMA memory, return the object
 * to the kmem cache and adjust the RX packet memory accounting.
 */
static void
sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
        sfxge_t *sp = srp->sr_sp;
        int64_t delta = sfxge_rx_pkt_mem_approx(srpp);

        ASSERT(!(srpp->srp_recycle));
        ASSERT3P(srpp->srp_mp, ==, NULL);

        srpp->srp_off = 0;
        srpp->srp_thp = NULL;
        srpp->srp_iphp = NULL;
        srpp->srp_etherhp = NULL;
        srpp->srp_size = 0;
        srpp->srp_flags = 0;

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

        /* Free the DMA memory */
        srpp->srp_base = NULL;
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);

        /* The delta must match what sfxge_rx_qpacket_create() added */
        if (sp->s_rx_pkt_mem_max)
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
}

/*
 * esballoc(9F) free routine for receive packet mblks, invoked when
 * the upper layers call freeb().  If recycling is enabled, allocate
 * a replacement mblk over the same buffer and return the packet to
 * the free packet pool; otherwise destroy the packet entirely.
 */
static void
sfxge_rx_qpacket_free(void *arg)
{
        sfxge_rx_packet_t *srpp = arg;
        sfxge_rxq_t *srp = srpp->srp_srp;

        /*
         * WARNING "man -s 9f esballoc"  states:
         * => runs sync from the thread calling freeb()
         * => must not sleep, or access data structures that could be freed
         */

        /* Check whether we want to recycle the receive packets */
        if (srpp->srp_recycle) {
                frtn_t *freep;
                mblk_t *mp;
                size_t size;

                freep = &(srpp->srp_free);
                ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
                ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);

                /*
                 * Allocate a matching mblk_t before the current one is
                 * freed.
                 */
                size = srpp->srp_mblksize;

                if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
                    freep)) != NULL) {
                        srpp->srp_mp = mp;

                        /* NORMAL recycled case */
                        sfxge_rx_qfpp_put(srp, srpp);
                        return;
                }
                /* desballoc() failed: fall through and destroy the packet */
        }

        srpp->srp_mp = NULL;

        sfxge_rx_qpacket_destroy(srp, srpp);
}

/*
 * Create a new receive packet: allocate the descriptor from the kmem
 * cache, allocate and bind DMA memory for the data buffer, and wrap
 * the buffer in an esballoc'd mblk whose free routine recycles the
 * packet.  Returns NULL on failure (or when the configured RX packet
 * memory limit would be exceeded); per-failure kstats are bumped.
 */
static sfxge_rx_packet_t *
sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_rx_packet_t *srpp;
        size_t size;
        caddr_t base;
        size_t unit;
        ddi_dma_cookie_t dmac;
        unsigned int ncookies;
        frtn_t *freep;
        mblk_t *mp;
        int err;
        int rc;

        size = sp->s_rx_buffer_size;

        /* Enforce the optional RX packet memory limit */
        if (sp->s_rx_pkt_mem_max &&
            (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
                DTRACE_PROBE(rx_pkt_mem_max);
                srp->sr_kstat.srk_rx_pkt_mem_limit++;
                return (NULL);
        }

        /* Allocate a new packet */
        if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
                srp->sr_kstat.srk_kcache_alloc_nomem++;
                rc = ENOMEM;
                goto fail1;
        }

        srpp->srp_srp = srp;
        srpp->srp_putp = srfppp->srfpp_putp;

        /* Allocate some DMA memory */
        err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
            &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
            NULL, &base, &unit, &(srpp->srp_acc_handle));
        switch (err) {
        case DDI_SUCCESS:
                break;

        case DDI_FAILURE:
                srp->sr_kstat.srk_dma_alloc_nomem++;
                rc = ENOMEM;
                goto fail2;

        default:
                srp->sr_kstat.srk_dma_alloc_fail++;
                rc = EFAULT;
                goto fail2;
        }

        /* Adjust the buffer to align the start of the DMA area correctly */
        base += sp->s_rx_buffer_align;
        size -= sp->s_rx_buffer_align;

        /* Bind the DMA memory to the DMA handle */
        err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
            base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
            DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
        switch (err) {
        case DDI_DMA_MAPPED:
                break;

        case DDI_DMA_INUSE:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EEXIST;
                goto fail3;

        case DDI_DMA_NORESOURCES:
                srp->sr_kstat.srk_dma_bind_nomem++;
                rc = ENOMEM;
                goto fail3;

        case DDI_DMA_NOMAPPING:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = ENOTSUP;
                goto fail3;

        case DDI_DMA_TOOBIG:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFBIG;
                goto fail3;

        default:
                srp->sr_kstat.srk_dma_bind_fail++;
                rc = EFAULT;
                goto fail3;
        }
        /* The DMA attributes request a single cookie */
        ASSERT3U(ncookies, ==, 1);

        srpp->srp_addr = dmac.dmac_laddress;

        srpp->srp_base = (unsigned char *)base;
        srpp->srp_mblksize = size;

        /*
         * Allocate a STREAMS block: We use size 1 so that the allocator will
         * use the first (and smallest) dblk cache.
         */
        freep = &(srpp->srp_free);
        freep->free_func = sfxge_rx_qpacket_free;
        freep->free_arg  = (caddr_t)srpp;

        if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
                srp->sr_kstat.srk_desballoc_fail++;
                rc = ENOMEM;
                goto fail4;
        }

        srpp->srp_mp = mp;
        srpp->srp_recycle = B_TRUE;

        /* Account for the memory consumed by this packet */
        if (sp->s_rx_pkt_mem_max) {
                int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
                atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
        }

        return (srpp);

fail4:
        DTRACE_PROBE(fail4);

        bzero(&(srpp->srp_free), sizeof (frtn_t));

        srpp->srp_mblksize = 0;
        srpp->srp_base = NULL;

        /* Unbind the DMA memory from the DMA handle */
        srpp->srp_addr = 0;
        (void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

fail3:
        DTRACE_PROBE(fail3);

        /* Free the DMA memory */
        ddi_dma_mem_free(&(srpp->srp_acc_handle));
        srpp->srp_acc_handle = NULL;

fail2:
        DTRACE_PROBE(fail2);

        srpp->srp_putp = NULL;
        srpp->srp_srp = NULL;

        kmem_cache_free(sp->s_rpc, srpp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (NULL);
}

#define SFXGE_REFILL_BATCH  64

/* Try to refill the RX descriptor ring from the associated free pkt pool */
/* Try to refill the RX descriptor ring from the associated free pkt pool */
static void
sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp;
        int ntodo;
        unsigned int count;
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        /* Clamp the refill count to the free ring space and the target */
        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                goto out;

        /* Pull any packets waiting on the per-CPU put lists */
        (void) sfxge_rx_qfpp_swizzle(srp);

        mp = srfppp->srfpp_get;
        count = srfppp->srfpp_count;
        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                if (mp == NULL)
                        break;

                next = mp->b_next;
                mp->b_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                freep = DB_FRTNP(mp);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);

                /* The MTU may have changed since the packet was allocated */
                if (MBLKSIZE(mp) != mblksize) {
                        srpp->srp_recycle = B_FALSE;

                        freeb(mp);

                        --count;
                        mp = next;
                        continue;
                }

                /* Reset parsing state; EFX_DISCARD until the event arrives */
                srpp->srp_off = 0;
                srpp->srp_thp = NULL;
                srpp->srp_iphp = NULL;
                srpp->srp_etherhp = NULL;
                srpp->srp_size = 0;
                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                /* Post descriptors to the hardware in batches */
                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }

                --count;
                mp = next;
        }

        srfppp->srfpp_get = mp;
        srfppp->srfpp_count = count;

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);

out:
        /* Track the low water mark of the pool since the last trim */
        if (srfppp->srfpp_count < srfppp->srfpp_min)
                srfppp->srfpp_min = srfppp->srfpp_count;
}

/* Preallocate packets and put them in the free packet pool */
static void
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
{
        int i;

        /* The preallocation count doubles as the pool's low water mark */
        srp->sr_fpp.srfpp_lowat = nprealloc;

        for (i = 0; i < nprealloc; i++) {
                sfxge_rx_packet_t *srpp = sfxge_rx_qpacket_create(srp);

                /* Stop early if packet creation fails */
                if (srpp == NULL)
                        break;

                sfxge_rx_qfpp_put(srp, srpp);
        }
}

/* Try to refill the RX descriptor ring by allocating new packets */
static void
sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        unsigned int batch;
        unsigned int rxfill;
        unsigned int mblksize;
        int ntodo;
        efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
        mblk_t *mp = NULL;

        prefetch_read_many(sp->s_enp);
        prefetch_read_many(srp->sr_erp);

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                return;

        /* Clamp the fill count to the free ring space and the target */
        rxfill = srp->sr_added - srp->sr_completed;
        ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
        ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
        ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

        if (ntodo == 0)
                return;

        mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

        batch = 0;
        while (ntodo-- > 0) {
                sfxge_rx_packet_t *srpp;
                unsigned int id;

                /* Allocate a brand new packet (not from the pool) */
                if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
                        break;

                mp = srpp->srp_mp;

                ASSERT3U(MBLKSIZE(mp), ==, mblksize);

                /* Newly created packets start with cleared parsing state */
                ASSERT3U(srpp->srp_off, ==, 0);
                ASSERT3P(srpp->srp_thp, ==, NULL);
                ASSERT3P(srpp->srp_iphp, ==, NULL);
                ASSERT3P(srpp->srp_etherhp, ==, NULL);
                ASSERT3U(srpp->srp_size, ==, 0);

                srpp->srp_flags = EFX_DISCARD;

                id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
                ASSERT(srp->sr_srpp[id] == NULL);
                srp->sr_srpp[id] = srpp;

                /* Post descriptors to the hardware in batches */
                addr[batch++] = srpp->srp_addr;
                if (batch == SFXGE_REFILL_BATCH) {
                        efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                            srp->sr_completed, srp->sr_added);
                        srp->sr_added += batch;
                        batch = 0;
                }
        }

        if (batch != 0) {
                efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
                    srp->sr_completed, srp->sr_added);
                srp->sr_added += batch;
        }

        efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
}

/*
 * Trim the free packet pool: first refill the descriptor ring, then
 * free any surplus packets above both the low water mark and the
 * minimum pool occupancy observed since the last trim (srfpp_min).
 * Called via the SFXGE_MAGIC_RX_QFPP_TRIM event posted by
 * sfxge_rx_qpoll().
 */
void
sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
{
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        mblk_t *p;
        mblk_t **pp;
        int count;

        ASSERT(mutex_owned(&(sep->se_lock)));

        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        /* Make sure the queue is full */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /* The refill may have emptied the pool */
        if (srfppp->srfpp_min == 0)
                goto done;

        /* Don't trim below the pool's low water mark */
        if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
                goto done;

        ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);

        /* Trim to the largest of srfppp->srfpp_min and srfppp->srfpp_lowat */
        if (srfppp->srfpp_lowat > srfppp->srfpp_min)
                count = srfppp->srfpp_count - srfppp->srfpp_lowat;
        else
                count = srfppp->srfpp_count - srfppp->srfpp_min;

        /* Walk the get list to find the truncation point */
        pp = &(srfppp->srfpp_get);
        while (--count >= 0) {
                ASSERT(pp);
                p = *pp;
                ASSERT(p != NULL);

                pp = &(p->b_next);
        }
        ASSERT(pp);
        p = *pp;

        /* Truncate the get list */
        *pp = NULL;

        /* Free the remainder */
        while (p != NULL) {
                mblk_t *next;
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;

                next = p->b_next;
                p->b_next = NULL;

                ASSERT3U(srfppp->srfpp_min, >, 0);
                srfppp->srfpp_min--;
                srfppp->srfpp_count--;

                freep = DB_FRTNP(p);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, p);

                /* Prevent recycling so freeb() destroys the packet */
                srpp->srp_recycle = B_FALSE;

                freeb(p);

                p = next;
        }

done:
        /* Reset the low water tracking for the next trim interval */
        srfppp->srfpp_min = srfppp->srfpp_count;
}

/*
 * Periodic timeout(9F) handler for an RX queue: post a magic event
 * to the queue's event queue so that the free packet pool is trimmed
 * in event-queue context, then re-arm the timer.
 */
static void
sfxge_rx_qpoll(void *arg)
{
        sfxge_rxq_t *srp = arg;
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        uint16_t magic;

        /*
         * man timeout(9f) states that this code should adhere to the
         * same requirements as a softirq handler - DO NOT BLOCK
         */

        /*
         * Post an event to the event queue to cause the free packet pool to be
         * trimmed if it is oversize.
         */
        magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;

#if defined(DEBUG)
        /* This is guaranteed due to the start/stop order of rx and ev */
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
#else
        /*
         * Bug22691 WORKAROUND:
         * This handler has been observed in the field to be invoked for a
         * queue in the INITIALIZED state, which should never happen.
         * Until the mechanism for this is properly understood, add defensive
         * checks.
         */
        if ((sep->se_state != SFXGE_EVQ_STARTED) ||
            (srp->sr_state != SFXGE_RXQ_STARTED) ||
            (!sep->se_eep)) {
                dev_err(sp->s_dip, CE_WARN, SFXGE_CMN_ERR
                    "RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
                    index, sep->se_state, srp->sr_state, sep->se_eep);
                return;
        }
#endif
        efx_ev_qpost(sep->se_eep, magic);

        /* Re-arm the poll timer; sr_tid is cleared by sfxge_rx_qpoll_stop() */
        srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
            drv_usectohz(sp->s_rxq_poll_usec));
}

/*
 * Start the periodic RXQ poll timer by scheduling an immediate first
 * invocation of sfxge_rx_qpoll(), which then re-arms itself.
 *
 * Called with the event queue lock held and the RXQ in STARTED state;
 * no timeout may already be outstanding.
 */
static void
sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_evq_t *sep = sp->s_sep[srp->sr_index];

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
        ASSERT3P(srp->sr_tid, ==, 0);

        /* Schedule the first poll to fire straight away */
        srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
}

/*
 * Stop the periodic RXQ poll timer, cancelling any outstanding
 * timeout.  Called with the event queue lock held.
 *
 * NOTE(review): se_lock is assumed not to be taken by the timeout
 * handler itself, otherwise holding it across untimeout(9f) would
 * deadlock — the handler only posts an event.
 */
static void
sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        timeout_id_t tid;

        ASSERT(mutex_owned(&(sep->se_lock)));
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        /*
         * Cancel the qpoll timer. Care is needed as this function
         * can race with sfxge_rx_qpoll() for timeout id updates.
         *
         * Do not hold locks used by any timeout(9f) handlers across
         * calls to untimeout(9f) as this will deadlock.
         */
        tid = 0;
        /* Loop until sr_tid stops changing under us (handler re-arms) */
        while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
                tid = srp->sr_tid;
                (void) untimeout(tid);
        }
        srp->sr_tid = 0;
}

/*
 * kstat(9s) update callback for the per-RXQ statistics set.
 *
 * Snapshots the RXQ allocation-failure and discard counters into the
 * named kstat entries, in the order they were initialised by
 * sfxge_rx_kstat_init().  Only KSTAT_READ is supported (EACCES
 * otherwise).  Runs with ks_lock (the event queue lock) held.
 */
static int
sfxge_rx_kstat_update(kstat_t *ksp, int rw)
{
        sfxge_rxq_t *srp = ksp->ks_private;
        sfxge_t *sp = srp->sr_sp;
        sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
        kstat_named_t *statp;
        int rc;

        if (rw != KSTAT_READ) {
                rc = EACCES;
                goto fail1;
        }

        ASSERT(mutex_owned(&(sep->se_lock)));

        /* Report nothing unless the queue is running */
        if (srp->sr_state != SFXGE_RXQ_STARTED)
                goto done;

        statp = ksp->ks_data;
        statp[0].value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
        statp[1].value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
        statp[2].value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
        statp[3].value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
        statp[4].value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
        statp[5].value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
        statp[6].value.ui32 = srp->sr_kstat.srk_desballoc_fail;
        statp[7].value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;

done:
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Create and install the per-RXQ named kstat set ("<drv>_rxqNNNN",
 * class "rxq").  The statistic order here must match the snapshot
 * order in sfxge_rx_kstat_update().
 *
 * Returns 0 on success or ENOMEM if the kstat cannot be created.
 *
 * Fixes: the format string used %04d for an unsigned int argument
 * (mismatched conversion specifier); use %04u.  Also pass the full
 * buffer size to snprintf(), which accounts for the NUL itself.
 */
static int
sfxge_rx_kstat_init(sfxge_rxq_t *srp)
{
        /* Names in snapshot order — keep in sync with the update callback */
        static const char *names[] = {
                "rx_pkt_mem_limit",
                "kcache_alloc_nomem",
                "dma_alloc_nomem",
                "dma_alloc_fail",
                "dma_bind_nomem",
                "dma_bind_fail",
                "desballoc_fail",
                "rxq_empty_discard"
        };
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        dev_info_t *dip = sp->s_dip;
        char name[MAXNAMELEN];
        kstat_t *ksp;
        kstat_named_t *knp;
        unsigned int id;
        int rc;

        /* Create the set */
        (void) snprintf(name, sizeof (name), "%s_rxq%04u",
            ddi_driver_name(dip), index);

        if ((ksp = kstat_create((char *)ddi_driver_name(dip),
            ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
            SFXGE_RX_NSTATS, 0)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }

        srp->sr_ksp = ksp;

        ksp->ks_update = sfxge_rx_kstat_update;
        ksp->ks_private = srp;
        ksp->ks_lock = &(sep->se_lock);

        /* Initialise the named stats */
        knp = ksp->ks_data;
        for (id = 0; id < sizeof (names) / sizeof (names[0]); id++) {
                kstat_named_init(knp, (char *)names[id], KSTAT_DATA_UINT32);
                knp++;
        }

        kstat_install(ksp);
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Allocate and initialise the RXQ object for the given index, moving
 * it to the INITIALIZED state and registering it in sp->s_srp[].
 *
 * Returns 0 on success, ENOMEM if allocation fails, or the error from
 * kstat initialisation.
 *
 * Fixes: on the kstat-init failure path the freed queue object was
 * left registered in sp->s_srp[index] (dangling pointer); clear the
 * slot before freeing.
 */
static int
sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
{
        sfxge_rxq_t *srp;
        int rc;

        ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);

        /* NB: KM_SLEEP allocations do not fail, but stay defensive */
        if ((srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);

        srp->sr_index = index;
        sp->s_srp[index] = srp;

        if ((rc = sfxge_rx_kstat_init(srp)) != 0)
                goto fail2;

        srp->sr_state = SFXGE_RXQ_INITIALIZED;

        return (0);

fail2:
        DTRACE_PROBE(fail2);
        /* Unregister the queue before returning it to the cache */
        sp->s_srp[index] = NULL;
        kmem_cache_free(sp->s_rqc, srp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Start the RXQ: program the buffer table, create and enable the
 * hardware receive queue, set the refill water marks, start the poll
 * timer and prime the ring with receive buffers.
 *
 * Takes the event queue lock.  The RXQ must be INITIALIZED and the
 * event queue STARTED.  Returns 0 on success or an error from buffer
 * table programming / queue creation.
 */
static int
sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
{
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rxq_t *srp;
        efsys_mem_t *esmp;
        efx_nic_t *enp;
        unsigned int level;
        int rc;

        mutex_enter(&(sep->se_lock));
        srp = sp->s_srp[index];
        enp = sp->s_enp;
        esmp = &(srp->sr_mem);

        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
        ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

        /* Zero the memory */
        bzero(esmp->esm_base, EFX_RXQ_SIZE(sp->s_rxq_size));

        /* Program the buffer table */
        if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
            EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
                goto fail1;

        /* Create the receive queue */
        if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
            esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep, &(srp->sr_erp)))
            != 0)
                goto fail2;

        /* Enable the receive queue */
        efx_rx_qenable(srp->sr_erp);

        /* Set the water marks (refill at 90%/45% of the ring limit) */
        srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
        srp->sr_lowat = srp->sr_hiwat / 2;

        srp->sr_state = SFXGE_RXQ_STARTED;
        srp->sr_flush = SFXGE_FLUSH_INACTIVE;

        sfxge_rx_qpoll_start(srp);

        /* Try to fill the queue from the pool */
        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

        /*
         * If there were insufficient buffers in the pool to reach at
         * least a batch then allocate some.
         */
        level = srp->sr_added - srp->sr_completed;
        if (level < SFXGE_RX_BATCH)
                sfxge_rx_qfill(srp, SFXGE_RX_BATCH);

        mutex_exit(&(sep->se_lock));

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(sep->se_lock));

        return (rc);
}

/*
 * Complete a coalesced (LRO) flow: rewrite the IP total length to
 * cover the merged payload, copy ack/window/flags from the last merged
 * TCP header into the first one, then move the coalesced mblk chain
 * onto the RXQ's delivery list.  No-op if the flow holds no packets.
 */
static void
sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
{
        mblk_t *mp;
        struct ether_header *etherhp;
        struct ip *iphp;
        struct tcphdr *thp;

        if (srfp->srf_mp == NULL)
                return;

        mp = srfp->srf_mp;
        etherhp = srfp->srf_etherhp;
        iphp = srfp->srf_iphp;
        thp = srfp->srf_last_thp;

        /* The accumulated length must match the chain's payload size */
        ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
            sizeof (struct ether_vlan_header) :
            sizeof (struct ether_header)) +
            srfp->srf_len, ==, msgdsize(mp));

        /* srf_len must fit the 16-bit IP total length field */
        ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
        iphp->ip_len = htons(srfp->srf_len);

        /* Propagate state from the newest segment to the head segment */
        srfp->srf_first_thp->th_ack = thp->th_ack;
        srfp->srf_first_thp->th_win = thp->th_win;
        srfp->srf_first_thp->th_flags = thp->th_flags;

        DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
            size_t, srfp->srf_len);

        srfp->srf_mp = NULL;
        srfp->srf_len = 0;

        /* Append to the RXQ's pending delivery chain */
        ASSERT(mp->b_next == NULL);
        *(srp->sr_mpp) = mp;
        srp->sr_mpp = &(mp->b_next);
}

/*
 * Attempt to merge a received TCP segment into an LRO flow.
 *
 * Returns B_TRUE if the packet was absorbed into the flow, or B_FALSE
 * if it could not be merged (the flow is completed first in that case,
 * and the caller must deliver the packet on its own).
 *
 * Fixes: the TCP sequence number was converted with htonl(); the value
 * arrives in network byte order so ntohl() is the semantically correct
 * conversion (and matches the usage in sfxge_rx_qpacket_coalesce()).
 * The two are numerically identical transformations, so behaviour is
 * unchanged.
 */
static boolean_t
sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
    sfxge_rx_packet_t *srpp, clock_t now)
{
        sfxge_t *sp = srp->sr_sp;
        struct ether_header *etherhp = srpp->srp_etherhp;
        struct ip *iphp = srpp->srp_iphp;
        struct tcphdr *thp = srpp->srp_thp;
        size_t off = srpp->srp_off;
        size_t size = (size_t)(srpp->srp_size);
        mblk_t *mp = srpp->srp_mp;
        uint32_t seq;
        unsigned int shift;

        ASSERT3U(MBLKL(mp), ==, off + size);
        ASSERT3U(DB_CKSUMFLAGS(mp), ==,
            HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);

        /* Convert the sequence number from network byte order */
        seq = ntohl(thp->th_seq);

        /*
         * If the time between this segment and the last is greater than RTO
         * then consider this a new flow.
         */
        if (now - srfp->srf_lbolt > srp->sr_rto) {
                srfp->srf_count = 1;
                srfp->srf_seq = seq + size;

                goto fail1;
        }

        if (seq != srfp->srf_seq) {
                /* Out of order: halve the in-order count (capped) */
                if (srfp->srf_count > SFXGE_SLOW_START)
                        srfp->srf_count = SFXGE_SLOW_START;

                srfp->srf_count >>= 1;

                srfp->srf_count++;
                srfp->srf_seq = seq + size;

                goto fail2;
        }

        /* Update the in-order segment count and sequence number */
        srfp->srf_count++;
        srfp->srf_seq = seq + size;

        /* Don't merge across pure ACK, URG, SYN or RST segments */
        if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
            thp->th_urp != 0)
                goto fail3;

        /*
         * If the in-order segment count has not yet reached the slow-start
         * threshold then we cannot coalesce.
         */
        if (srfp->srf_count < SFXGE_SLOW_START)
                goto fail4;

        /* Scale up the packet size from 4k (the maximum being 64k) */
        ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
        shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
        if (srfp->srf_len + size >= (1 << shift))
                sfxge_rx_qflow_complete(srp, srfp);

        ASSERT(mp->b_cont == NULL);

        if (srfp->srf_mp == NULL) {
                /* First packet in this flow */
                srfp->srf_etherhp = etherhp;
                srfp->srf_iphp = iphp;
                srfp->srf_first_thp = srfp->srf_last_thp = thp;

                ASSERT3P(mp->b_cont, ==, NULL);
                srfp->srf_mp = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len = ntohs(iphp->ip_len);

                /*
                 * If the flow is not already in the list of occupied flows then
                 * add it.
                 */
                if (srfp->srf_next == NULL &&
                    srp->sr_srfpp != &(srfp->srf_next)) {
                        *(srp->sr_srfpp) = srfp;
                        srp->sr_srfpp = &(srfp->srf_next);
                }
        } else {
                /* Later packet in this flow - skip TCP header */
                srfp->srf_last_thp = thp;

                mp->b_rptr += off;
                ASSERT3U(MBLKL(mp), ==, size);

                ASSERT3P(mp->b_cont, ==, NULL);
                *(srfp->srf_mpp) = mp;
                srfp->srf_mpp = &(mp->b_cont);

                srfp->srf_len += size;

                ASSERT(srfp->srf_next != NULL ||
                    srp->sr_srfpp == &(srfp->srf_next));
        }

        DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);

        /*
         * Try to align coalesced segments on push boundaries, unless they
         * are too frequent.
         */
        if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
            thp->th_flags & TH_PUSH)
                sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_TRUE);

fail4:
fail3:
fail2:
fail1:
        /* Cannot merge: flush anything held and report the miss */
        sfxge_rx_qflow_complete(srp, srfp);

        srfp->srf_lbolt = now;
        return (B_FALSE);
}

/*
 * Attempt TCP coalescing (LRO) across the RXQ's pending packet chain.
 *
 * Detaches the chain from srp->sr_mp, then for each packet either
 * merges it into a matching flow (the per-RXQ hash table sr_flow[]) or
 * re-appends it, unmerged, to the delivery chain.  Flows are matched
 * on src/dst address, ports and VLAN TCI; hash aliases and stale
 * entries are rejected rather than merged.
 */
void
sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        clock_t now;
        mblk_t *mp;
        sfxge_rx_flow_t *srfp;

        ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);

        now = ddi_get_lbolt();

        mp = srp->sr_mp;

        /* Detach the chain; rejected packets are re-appended below */
        srp->sr_mp = NULL;
        srp->sr_mpp = &(srp->sr_mp);

        /* Start with the last flow to be appended to */
        srfp = *(srp->sr_srfpp);

        while (mp != NULL) {
                frtn_t *freep;
                sfxge_rx_packet_t *srpp;
                struct ether_header *etherhp;
                struct ip *iphp;
                struct tcphdr *thp;
                size_t off;
                size_t size;
                uint16_t ether_tci;
                uint32_t hash;
                uint32_t tag;
                mblk_t *next;
                sfxge_packet_type_t pkt_type;
                uint16_t sport, dport;

                next = mp->b_next;
                mp->b_next = NULL;

                if (next != NULL)
                        prefetch_read_many(next);

                /* Recover the packet descriptor from the free routine arg */
                freep = DB_FRTNP(mp);
                /*LINTED*/
                srpp = (sfxge_rx_packet_t *)(freep->free_arg);
                ASSERT3P(srpp->srp_mp, ==, mp);

                /* If the packet is not TCP then we cannot coalesce it */
                if (~(srpp->srp_flags) & EFX_PKT_TCP)
                        goto reject;

                /*
                 * If the packet is not fully checksummed then we cannot
                 * coalesce it.
                 */
                if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
                        goto reject;

                /* Parse the TCP header */
                pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp, &off,
                    &size, &sport, &dport);
                ASSERT(pkt_type == SFXGE_PACKET_TYPE_IPV4_TCP);
                ASSERT(etherhp != NULL);
                ASSERT(iphp != NULL);
                ASSERT(thp != NULL);
                ASSERT(off != 0);

                /* Don't coalesce fragmented datagrams (DF bit is fine) */
                if ((iphp->ip_off & ~htons(IP_DF)) != 0)
                        goto reject;

                if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
                        struct ether_vlan_header *ethervhp;

                        ethervhp = (struct ether_vlan_header *)etherhp;
                        ether_tci = ethervhp->ether_tci;
                } else {
                        ether_tci = 0;
                }

                /*
                 * Make sure any minimum length padding is stripped
                 * before we try to add the packet to a flow.
                 */
                ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
                    (size_t)(srpp->srp_size));
                ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
                    (size_t)(srpp->srp_size));

                if (sp->s_rx_prefix_size + off + size <
                    (size_t)(srpp->srp_size))
                        mp->b_wptr = mp->b_rptr + off + size;

                /*
                 * If there is no current flow, or the segment does not match
                 * the current flow then we must attempt to look up the
                 * correct flow in the table.
                 */
                if (srfp == NULL)
                        goto lookup;

                if (srfp->srf_saddr != iphp->ip_src.s_addr ||
                    srfp->srf_daddr != iphp->ip_dst.s_addr)
                        goto lookup;

                if (srfp->srf_sport != thp->th_sport ||
                    srfp->srf_dport != thp->th_dport)
                        goto lookup;

                if (srfp->srf_tci != ether_tci)
                        goto lookup;

add:
                ASSERT(srfp != NULL);

                /* Record parse results for sfxge_rx_qflow_add() */
                srpp->srp_etherhp = etherhp;
                srpp->srp_iphp = iphp;
                srpp->srp_thp = thp;
                srpp->srp_off = off;

                ASSERT3U(size, <, (1 << 16));
                srpp->srp_size = (uint16_t)size;

                /* Try to append the packet to the flow */
                if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
                        goto reject;

                mp = next;
                continue;

lookup:
                /*
                 * If there is a prefix area then read the hash from that,
                 * otherwise calculate it.
                 */
                if (sp->s_rx_prefix_size != 0) {
                        hash = efx_psuedo_hdr_hash_get(sp->s_enp,
                            EFX_RX_HASHALG_TOEPLITZ,
                            DB_BASE(mp));
                } else {
                        SFXGE_TCP_HASH(sp,
                            &iphp->ip_src.s_addr,
                            thp->th_sport,
                            &iphp->ip_dst.s_addr,
                            thp->th_dport,
                            hash);
                }

                srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
                tag = hash + 1; /* Make sure it's not zero */

                /*
                 * If the flow we have found does not match the hash then
                 * it may be an unused flow, or it may be stale.
                 */
                if (tag != srfp->srf_tag) {
                        /* A recently-used slot cannot be reclaimed yet */
                        if (srfp->srf_count != 0) {
                                if (now - srfp->srf_lbolt <= srp->sr_rto)
                                        goto reject;
                        }

                        if (srfp->srf_mp != NULL)
                                goto reject;

                        /* Start a new flow */
                        ASSERT(srfp->srf_next == NULL);

                        srfp->srf_tag = tag;

                        srfp->srf_saddr = iphp->ip_src.s_addr;
                        srfp->srf_daddr = iphp->ip_dst.s_addr;
                        srfp->srf_sport = thp->th_sport;
                        srfp->srf_dport = thp->th_dport;
                        srfp->srf_tci = ether_tci;

                        srfp->srf_count = 0;
                        srfp->srf_seq = ntohl(thp->th_seq);

                        srfp->srf_lbolt = now;
                        goto add;
                }

                /*
                 * If the flow we have found does match the hash then it could
                 * still be an alias.
                 */
                if (srfp->srf_saddr != iphp->ip_src.s_addr ||
                    srfp->srf_daddr != iphp->ip_dst.s_addr)
                        goto reject;

                if (srfp->srf_sport != thp->th_sport ||
                    srfp->srf_dport != thp->th_dport)
                        goto reject;

                if (srfp->srf_tci != ether_tci)
                        goto reject;

                goto add;

reject:
                /* Deliver the packet unmerged */
                *(srp->sr_mpp) = mp;
                srp->sr_mpp = &(mp->b_next);

                mp = next;
        }
}

/*
 * Process receive completions for the RXQ.
 *
 * For every descriptor between sr_completed and sr_pending: sync the
 * DMA buffer, validate the packet (discarding on error, oversize or
 * loopback), set up checksum offload flags and append the mblk to the
 * delivery chain.  The chain is then optionally TCP-coalesced, passed
 * up to GLD, and the ring is refilled.
 *
 * 'eop' marks the end of the current event poll; open LRO flows are
 * completed then so that segments are not held across polls.
 *
 * Called with the event queue lock held.
 */
void
sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
{
        sfxge_t *sp = srp->sr_sp;
        unsigned int index = srp->sr_index;
        sfxge_evq_t *sep = sp->s_sep[index];
        unsigned int completed;
        sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
        unsigned int level;

        ASSERT(mutex_owned(&(sep->se_lock)));

        ASSERT(srp->sr_mp == NULL);
        ASSERT(srp->sr_mpp == &(srp->sr_mp));

        completed = srp->sr_completed;
        while (completed != srp->sr_pending) {
                unsigned int id;
                sfxge_rx_packet_t *srpp;
                mblk_t *mp;
                size_t size;
                uint16_t flags;
                int rc;

                /* The ring size is a power of two */
                id = completed++ & (sp->s_rxq_size - 1);

                /* Prefetch ahead to hide memory latency */
                if (srp->sr_pending - completed >= 4) {
                        unsigned int prefetch;

                        prefetch = (id + 4) & (sp->s_rxq_size - 1);

                        srpp = srp->sr_srpp[prefetch];
                        ASSERT(srpp != NULL);

                        mp = srpp->srp_mp;
                        prefetch_read_many(mp->b_datap);
                } else if (completed == srp->sr_pending) {
                        prefetch_read_many(srp->sr_mp);
                }

                srpp = srp->sr_srpp[id];
                ASSERT(srpp != NULL);

                srp->sr_srpp[id] = NULL;

                mp = srpp->srp_mp;
                ASSERT(mp->b_cont == NULL);

                /* when called from sfxge_rx_qstop() */
                if (srp->sr_state != SFXGE_RXQ_STARTED)
                        goto discard;

                if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
                        goto discard;

                /* Make the data visible to the kernel */
                rc = ddi_dma_sync(srpp->srp_dma_handle, 0,
                    sp->s_rx_buffer_size, DDI_DMA_SYNC_FORKERNEL);
                ASSERT3P(rc, ==, DDI_SUCCESS);

                /* Read the length from the pseudo header if required */
                if (srpp->srp_flags & EFX_PKT_PREFIX_LEN) {
                        rc = efx_psuedo_hdr_pkt_length_get(sp->s_enp,
                            mp->b_rptr,
                            &srpp->srp_size);
                        ASSERT3P(rc, ==, 0);
                        srpp->srp_size += sp->s_rx_prefix_size;
                }

                /* Set up the packet length */
                ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
                mp->b_rptr += sp->s_rx_prefix_size;

                prefetch_read_many(mp->b_rptr);

                ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
                mp->b_wptr += (size_t)(srpp->srp_size);
                ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));

                /* Calculate the maximum packet size */
                size = sp->s_mtu;
                size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
                    sizeof (struct ether_vlan_header) :
                    sizeof (struct ether_header);

                if (MBLKL(mp) > size)
                        goto discard;

                /* Check for loopback packets */
                if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
                    !(srpp->srp_flags & EFX_PKT_IPV6)) {
                        struct ether_header *etherhp;

                        /*LINTED*/
                        etherhp = (struct ether_header *)(mp->b_rptr);

                        if (etherhp->ether_type ==
                            htons(SFXGE_ETHERTYPE_LOOPBACK)) {
                                DTRACE_PROBE(loopback);

                                srp->sr_loopback++;
                                goto discard;
                        }
                }

                /* Set up the checksum information */
                flags = 0;

                if (srpp->srp_flags & EFX_CKSUM_IPV4) {
                        ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
                        flags |= HCK_IPV4_HDRCKSUM;
                }

                if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
                        ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
                            srpp->srp_flags & EFX_PKT_UDP);
                        flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
                }

                DB_CKSUMSTART(mp) = 0;
                DB_CKSUMSTUFF(mp) = 0;
                DB_CKSUMEND(mp) = 0;
                DB_CKSUMFLAGS(mp) = flags;
                DB_CKSUM16(mp) = 0;

                /* Add the packet to the tail of the chain */
                srfppp->srfpp_loaned++;

                ASSERT(mp->b_next == NULL);
                *(srp->sr_mpp) = mp;
                srp->sr_mpp = &(mp->b_next);

                continue;

discard:
                /* Return the packet to the pool */
                srfppp->srfpp_loaned++;
                freeb(mp); /* Equivalent to freemsg() as b_cont==0 */
        }
        srp->sr_completed = completed;

        /* Attempt to coalesce any TCP packets */
        if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
                sfxge_rx_qpacket_coalesce(srp);

        /*
         * If there are any pending flows and this is the end of the
         * poll then they must be completed.
         */
        if (srp->sr_srfp != NULL && eop) {
                sfxge_rx_flow_t *srfp;

                srfp = srp->sr_srfp;

                srp->sr_srfp = NULL;
                srp->sr_srfpp = &(srp->sr_srfp);

                do {
                        sfxge_rx_flow_t *next;

                        next = srfp->srf_next;
                        srfp->srf_next = NULL;

                        sfxge_rx_qflow_complete(srp, srfp);

                        srfp = next;
                } while (srfp != NULL);
        }

        level = srp->sr_pushed - srp->sr_completed;

        /* If there are any packets then pass them up the stack */
        if (srp->sr_mp != NULL) {
                mblk_t *mp;

                mp = srp->sr_mp;

                srp->sr_mp = NULL;
                srp->sr_mpp = &(srp->sr_mp);

                if (level == 0) {
                        /* Try to refill ASAP */
                        sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
                        level = srp->sr_pushed - srp->sr_completed;
                }

                /*
                 * If the RXQ is still empty, discard and recycle the
                 * current entry to ensure that the ring always
                 * contains at least one descriptor. This ensures that
                 * the next hardware RX will trigger an event
                 * (possibly delayed by interrupt moderation) and
                 * trigger another refill/fill attempt.
                 *
                 * Note this drops a complete LRO fragment from the
                 * start of the batch.
                 *
                 * Note also that copymsgchain() does not help with
                 * resource starvation here, unless we are short of DMA
                 * mappings.
                 */
                if (level == 0) {
                        mblk_t *nmp;

                        srp->sr_kstat.srk_rxq_empty_discard++;
                        DTRACE_PROBE1(rxq_empty_discard, int, index);
                        nmp = mp->b_next;
                        if (nmp)
                                sfxge_gld_rx_post(sp, index, nmp);
                        /* Freeing the head recycles it back onto the ring */
                        freemsg(mp);
                } else {
                        sfxge_gld_rx_post(sp, index, mp);
                }
        }

        /* Top up the queue if necessary */
        if (level < srp->sr_hiwat) {
                sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

                level = srp->sr_added - srp->sr_completed;
                if (level < srp->sr_lowat)
                        sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
        }
}

/*
 * Handle a successful RXQ flush completion event.
 *
 * If sfxge_rx_qstop() is waiting on the flush, record the DONE state
 * and wake it.  A delayed event arriving after the stop path has timed
 * out finds the state no longer PENDING and simply updates it without
 * a wakeup (see SFCbug22989).
 */
void
sfxge_rx_qflush_done(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
        boolean_t was_pending;

        ASSERT(mutex_owned(&(sep->se_lock)));

        was_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
        srp->sr_flush = SFXGE_FLUSH_DONE;
        if (was_pending)
                cv_broadcast(&(srp->sr_flush_kv));
}

/*
 * Handle a failed RXQ flush event.
 *
 * Mirror image of sfxge_rx_qflush_done(): record the FAILED state and
 * wake sfxge_rx_qstop() if it is still waiting.  Late events arriving
 * after the stop path has timed out find the state no longer PENDING
 * and cause no wakeup (see SFCbug22989).
 */
void
sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
{
        sfxge_t *sp = srp->sr_sp;
        sfxge_evq_t *sep = sp->s_sep[srp->sr_index];
        boolean_t was_pending;

        ASSERT(mutex_owned(&(sep->se_lock)));

        was_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
        srp->sr_flush = SFXGE_FLUSH_FAILED;
        if (was_pending)
                cv_broadcast(&(srp->sr_flush_kv));
}

/*
 * Stop the RXQ: cancel the poll timer, flush the hardware queue
 * (waiting up to SFXGE_RX_QFLUSH_USEC unless a hardware error is
 * flagged), destroy the queue, clear the buffer table and release all
 * outstanding receive buffers.  The RXQ returns to the INITIALIZED
 * state.  Takes the event queue lock; caller holds s_state_lock.
 */
static void
sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
{
        dev_info_t *dip = sp->s_dip;
        sfxge_evq_t *sep = sp->s_sep[index];
        sfxge_rxq_t *srp;
        clock_t timeout;
        unsigned int flush_tries = SFXGE_RX_QFLUSH_TRIES;
        int rc;

        ASSERT(mutex_owned(&(sp->s_state_lock)));

        mutex_enter(&(sep->se_lock));

        srp = sp->s_srp[index];
        ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

        sfxge_rx_qpoll_stop(srp);

        /* Further packets are discarded by sfxge_rx_qcomplete() */
        srp->sr_state = SFXGE_RXQ_INITIALIZED;

        if (sp->s_hw_err != SFXGE_HW_OK) {
                /*
                 * Flag indicates possible hardware failure.
                 * Attempt flush but do not wait for it to complete.
                 */
                srp->sr_flush = SFXGE_FLUSH_DONE;
                (void) efx_rx_qflush(srp->sr_erp);
        }

        /* Wait up to 2s for queue flushing to complete */
        timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);

        while (srp->sr_flush != SFXGE_FLUSH_DONE && flush_tries-- > 0) {
                if ((rc = efx_rx_qflush(srp->sr_erp)) != 0) {
                        /* EALREADY means the flush already completed */
                        if (rc == EALREADY)
                                srp->sr_flush = SFXGE_FLUSH_DONE;
                        else
                                srp->sr_flush = SFXGE_FLUSH_FAILED;
                        break;
                }
                srp->sr_flush = SFXGE_FLUSH_PENDING;
                /* Woken by sfxge_rx_qflush_done()/_failed() */
                if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
                    timeout) < 0) {
                        /* Timeout waiting for successful or failed flush */
                        dev_err(dip, CE_NOTE,
                            SFXGE_CMN_ERR "rxq[%d] flush timeout", index);
                        break;
                }
        }

        if (srp->sr_flush == SFXGE_FLUSH_FAILED)
                dev_err(dip, CE_NOTE,
                    SFXGE_CMN_ERR "rxq[%d] flush failed", index);

        DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
        srp->sr_flush = SFXGE_FLUSH_DONE;

        /* Destroy the receive queue */
        efx_rx_qdestroy(srp->sr_erp);
        srp->sr_erp = NULL;

        /* Clear entries from the buffer table */
        sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
            EFX_RXQ_NBUFS(sp->s_rxq_size));

        /*
         * Free any unused RX packets which had descriptors on the RXQ
         * Packets will be discarded as state != STARTED
         */
        srp->sr_pending = srp->sr_added;
        sfxge_rx_qcomplete(srp, B_TRUE);

        ASSERT3U(srp->sr_completed, ==, srp->sr_pending);

        srp->sr_added = 0;
        srp->sr_pushed = 0;
        srp->sr_pending = 0;
        srp->sr_completed = 0;
        srp->sr_loopback = 0;

        srp->sr_lowat = 0;
        srp->sr_hiwat = 0;

        mutex_exit(&(sep->se_lock));
}

/* Remove the per-RXQ kstat set and drop the stale reference. */
static void
sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
{
        kstat_t *ksp = srp->sr_ksp;

        srp->sr_ksp = NULL;
        kstat_delete(ksp);
}

/*
 * Tear down receive queue "index": detach it from the softc, remove its
 * kstats, drain its free packet pool and return it to the queue cache.
 */
static void
sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
{
        sfxge_rxq_t *rxq = sp->s_srp[index];

        ASSERT3U(rxq->sr_state, ==, SFXGE_RXQ_INITIALIZED);

        /* Detach the queue from the softc before tearing it down */
        sp->s_srp[index] = NULL;
        rxq->sr_state = SFXGE_RXQ_UNINITIALIZED;

        /* Remove the per-queue kstats */
        sfxge_rx_kstat_fini(rxq);

        /* Empty the free packet pool */
        sfxge_rx_qfpp_empty(rxq);

        rxq->sr_index = 0;

        /* Return the queue object to its kmem cache */
        kmem_cache_free(sp->s_rqc, rxq);
}

/*
 * kstat update callback for the RSS set: report, for each event queue,
 * how many entries of the RSS indirection table point at it, plus the
 * configured scale count in the final named stat.
 */
static int
sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
{
        sfxge_t *sp = ksp->ks_private;
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        sfxge_intr_t *sip = &(sp->s_intr);
        kstat_named_t *knp;
        unsigned int index;
        unsigned int entry;
        unsigned int *freq;
        int rc;

        ASSERT(mutex_owned(&(srsp->srs_lock)));

        /* This kstat set is read-only */
        if (rw != KSTAT_READ) {
                rc = EACCES;
                goto fail1;
        }

        freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
            KM_NOSLEEP);
        if (freq == NULL) {
                rc = ENOMEM;
                goto fail2;
        }

        /* Count how often each event queue appears in the RSS table */
        for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++)
                freq[srsp->srs_tbl[entry]]++;

        /* Copy the per-queue counts into the named stats */
        knp = ksp->ks_data;
        for (index = 0; index < sip->si_nalloc; index++, knp++)
                knp->value.ui64 = freq[index];

        /* The final entry reports the configured scale count */
        knp->value.ui64 = srsp->srs_count;

        kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);

        return (0);

fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);
        return (rc);
}

/*
 * Create and install the RSS kstat set: one "evqNNNN_count" stat per
 * allocated event queue plus a trailing "scale" stat (see
 * sfxge_rx_scale_kstat_update() for how they are filled in).
 *
 * Returns 0 on success or ENOMEM if the kstat set cannot be created.
 */
static int
sfxge_rx_scale_kstat_init(sfxge_t *sp)
{
        dev_info_t *dip = sp->s_dip;
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        sfxge_intr_t *sip = &(sp->s_intr);
        char name[MAXNAMELEN];
        char evq_name[MAXNAMELEN];
        kstat_t *ksp;
        kstat_named_t *knp;
        unsigned int index;
        int rc;

        /* Create the set */
        (void) snprintf(name, MAXNAMELEN - 1, "%s_rss", ddi_driver_name(dip));

        if ((ksp = kstat_create((char *)ddi_driver_name(dip),
            ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
            sip->si_nalloc + 1, 0)) == NULL) {
                rc = ENOMEM;
                goto fail1;
        }

        srsp->srs_ksp = ksp;

        ksp->ks_update = sfxge_rx_scale_kstat_update;
        ksp->ks_private = sp;
        ksp->ks_lock = &(srsp->srs_lock);

        /*
         * Initialise the named stats. Use a separate buffer (evq_name)
         * rather than shadowing the outer "name" with an inner
         * declaration, which -Wshadow would flag and which obscured
         * which buffer was live.
         */
        knp = ksp->ks_data;
        for (index = 0; index < sip->si_nalloc; index++) {
                (void) snprintf(evq_name, MAXNAMELEN - 1, "evq%04d_count",
                    index);
                kstat_named_init(knp, evq_name, KSTAT_DATA_UINT64);
                knp++;
        }

        kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);

        kstat_install(ksp);
        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/* Remove the RSS kstat set and clear the stale pointer. */
static void
sfxge_rx_scale_kstat_fini(sfxge_t *sp)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        kstat_t *ksp = srsp->srs_ksp;

        srsp->srs_ksp = NULL;
        kstat_delete(ksp);
}


/*
 * Fetch the "rx_scale_count" driver property.  Zero or any negative
 * value means "use the number of logical CPUs"; the default (property
 * absent) is SFXGE_RX_SCALE_MAX.
 */
unsigned int
sfxge_rx_scale_prop_get(sfxge_t *sp)
{
        int rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_scale_count", SFXGE_RX_SCALE_MAX);

        return ((rx_scale <= 0) ? ncpus : rx_scale);
}


/*
 * Initialize receive-side scaling state: allocate the per-CPU usage
 * table, clamp the configured queue count to [1, si_nalloc] and set up
 * the RSS kstats.
 *
 * Returns 0 on success; on failure all resources allocated here are
 * released again (srs_cpu was previously leaked on the failure path —
 * it is only freed by sfxge_rx_scale_fini(), which never runs when
 * init fails).
 */
static int
sfxge_rx_scale_init(sfxge_t *sp)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        sfxge_intr_t *sip = &(sp->s_intr);
        int rc;

        ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);

        /* Create tables for CPU, core, cache and chip counts */
        srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);

        mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);

        /* We need at least one event queue */
        srsp->srs_count = sfxge_rx_scale_prop_get(sp);
        if (srsp->srs_count > sip->si_nalloc)
                srsp->srs_count = sip->si_nalloc;
        if (srsp->srs_count < 1)
                srsp->srs_count = 1;

        /* Set up the kstats */
        if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
                goto fail1;

        srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);
        mutex_destroy(&(srsp->srs_lock));

        /* Release the CPU table allocated above */
        kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
        srsp->srs_cpu = NULL;

        return (rc);
}

/*
 * Taskq callback: rebuild the RSS indirection table so that receive
 * traffic is spread over srs_count event queues, biased towards the
 * queues whose CPUs currently have the least global contention
 * (tracked in the module-wide sfxge_cpu[] table), then program the
 * result into the hardware.
 *
 * Lock order: srs_lock is taken for the whole operation; cpu_lock is
 * nested inside it around the contention-table updates.
 */
void
sfxge_rx_scale_update(void *arg)
{
        sfxge_t *sp = arg;
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        sfxge_intr_t *sip;
        processorid_t id;
        unsigned int count;
        unsigned int *tbl;
        unsigned int *rating;
        unsigned int entry;
        int rc;

        mutex_enter(&(srsp->srs_lock));

        /* Bail out if RSS was stopped after this task was dispatched */
        if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
                rc = EFAULT;
                goto fail1;
        }

        /* Condensed table: one entry per chosen queue, KM_NOSLEEP */
        if ((tbl =  kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
            KM_NOSLEEP)) == NULL) {
                rc = ENOMEM;
                goto fail2;
        }

        /* Scratch array of per-EVQ contention scores */
        sip = &(sp->s_intr);
        if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
            KM_NOSLEEP)) == NULL) {
                rc = ENOMEM;
                goto fail3;
        }

        mutex_enter(&cpu_lock);

        /*
         * Subtract any current CPU, core, cache and chip usage from the
         * global contention tables.
         */
        for (id = 0; id < NCPU; id++) {
                ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
                sfxge_cpu[id] -= srsp->srs_cpu[id];
                srsp->srs_cpu[id] = 0;
        }

        ASSERT(srsp->srs_count != 0);

        /* Choose as many event queues as we need */
        for (count = 0; count < srsp->srs_count; count++) {
                unsigned int index;
                sfxge_evq_t *sep;
                unsigned int choice;
                unsigned int choice_rating;

                bzero(rating, sizeof (unsigned int) * sip->si_nalloc);

                /*
                 * Rate each event queue on its global level of CPU
                 * contention.
                 */
                for (index = 0; index < sip->si_nalloc; index++) {
                        sep = sp->s_sep[index];

                        id = sep->se_cpu_id;
                        rating[index] += sfxge_cpu[id];
                }

                /* Choose the queue with the lowest CPU contention */
                choice = 0;
                choice_rating = rating[0];

                for (index = 1; index < sip->si_nalloc; index++) {
                        if (rating[index] < choice_rating) {
                                choice = index;
                                choice_rating = rating[index];
                        }
                }

                /* Add our choice to the condensed RSS table */
                tbl[count] = choice;

                /*
                 * Add information to the global contention tables so
                 * the next iteration (and other instances) see this
                 * queue's CPU as more loaded.
                 */
                sep = sp->s_sep[choice];

                id = sep->se_cpu_id;
                srsp->srs_cpu[id]++;
                sfxge_cpu[id]++;
        }

        mutex_exit(&cpu_lock);

        /*
         * Build the expanded RSS table by cycling round-robin through
         * the condensed table of chosen queues.
         */
        count = 0;
        for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
                unsigned int index;

                index = tbl[count];
                count = (count + 1) % srsp->srs_count;

                srsp->srs_tbl[entry] = index;
        }

        /* Program the expanded RSS table into the hardware */
        (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
            SFXGE_RX_SCALE_MAX);

        mutex_exit(&(srsp->srs_lock));
        kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
        kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
        return;

fail3:
        DTRACE_PROBE(fail3);
        kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(srsp->srs_lock));
}

/*
 * Start receive-side scaling: clear the RSS table in hardware, set up
 * the Toeplitz hash state and dispatch an asynchronous rebuild of the
 * real indirection table.
 */
static int
sfxge_rx_scale_start(sfxge_t *sp)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        int rc;

        mutex_enter(&(srsp->srs_lock));

        ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);

        /* Push an all-zeroes RSS table to the hardware */
        bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
        (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
            SFXGE_RX_SCALE_MAX);

        rc = sfxge_toeplitz_hash_init(sp);
        if (rc != 0)
                goto fail1;

        srsp->srs_state = SFXGE_RX_SCALE_STARTED;

        mutex_exit(&(srsp->srs_lock));

        /*
         * Rebuild the real RSS table asynchronously;
         * sfxge_t->s_state_lock held by the caller.
         */
        (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
            DDI_SLEEP);

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(srsp->srs_lock));

        return (rc);
}

/*
 * Report the configured RSS queue count through *countp.
 * Returns ENOTSUP unless RSS is initialized or started.
 */
int
sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        boolean_t valid;
        int rc;

        mutex_enter(&(srsp->srs_lock));

        valid = (srsp->srs_state == SFXGE_RX_SCALE_INITIALIZED ||
            srsp->srs_state == SFXGE_RX_SCALE_STARTED);
        if (!valid) {
                rc = ENOTSUP;
                goto fail1;
        }

        *countp = srsp->srs_count;

        mutex_exit(&(srsp->srs_lock));

        return (0);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(srsp->srs_lock));

        return (rc);
}

/*
 * Set the RSS queue count.  The count must lie in [1, si_nalloc];
 * if RSS is already running the indirection table is rebuilt
 * asynchronously to pick up the new count.
 */
int
sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        sfxge_intr_t *sip = &(sp->s_intr);
        boolean_t dispatch;
        int rc;

        /* The count must map onto the available event queues */
        if (count < 1 || count > sip->si_nalloc) {
                rc = EINVAL;
                goto fail1;
        }

        mutex_enter(&(srsp->srs_lock));

        if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
            srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
                rc = ENOTSUP;
                goto fail2;
        }

        srsp->srs_count = count;

        /* Only rebuild the RSS table if scaling is already running */
        dispatch = (srsp->srs_state == SFXGE_RX_SCALE_STARTED);

        mutex_exit(&(srsp->srs_lock));

        if (dispatch) {
                /* no locks held */
                (void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update,
                    sp, DDI_SLEEP);
        }

        return (0);

fail2:
        DTRACE_PROBE(fail2);

        mutex_exit(&(srsp->srs_lock));

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Stop receive-side scaling: return this instance's CPU usage to the
 * global contention table and clear the RSS table in the hardware.
 */
static void
sfxge_rx_scale_stop(sfxge_t *sp)
{
        sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
        processorid_t id;

        mutex_enter(&(srsp->srs_lock));

        ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);

        srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

        /*
         * Subtract any current CPU, core, cache and chip usage from the
         * global contention tables.
         */
        mutex_enter(&cpu_lock);
        for (id = 0; id < NCPU; id++) {
                ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
                sfxge_cpu[id] -= srsp->srs_cpu[id];
                srsp->srs_cpu[id] = 0;
        }
        mutex_exit(&cpu_lock);

        /* Clear down the RSS table and push it to the hardware */
        bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
        (void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
            SFXGE_RX_SCALE_MAX);

        mutex_exit(&(srsp->srs_lock));
}

/* Tear down receive-side scaling state (kstats, lock, CPU table). */
static void
sfxge_rx_scale_fini(sfxge_t *sp)
{
        sfxge_rx_scale_t *scalep = &(sp->s_rx_scale);

        ASSERT3U(scalep->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);
        scalep->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;

        /* Remove the RSS kstats */
        sfxge_rx_scale_kstat_fini(sp);

        scalep->srs_count = 0;

        mutex_destroy(&(scalep->srs_lock));

        /* Free the per-CPU usage table */
        kmem_free(scalep->srs_cpu, sizeof (unsigned int) * NCPU);
        scalep->srs_cpu = NULL;

        sfxge_toeplitz_hash_fini(sp);
}

/*
 * Initialize the receive path: RSS state, the packet and RXQ kmem
 * caches, the per-instance memory limit, and one receive queue per
 * allocated interrupt.  Requires interrupts to have been initialized.
 */
int
sfxge_rx_init(sfxge_t *sp)
{
        sfxge_intr_t *sip = &(sp->s_intr);
        char name[MAXNAMELEN];
        int index;
        int rc;

        if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
                rc = EINVAL;
                goto fail1;
        }

        if ((rc = sfxge_rx_scale_init(sp)) != 0)
                goto fail2;

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        /* NOTE(review): kmem_cache_create() is assumed infallible here */
        sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
            NULL, sp, NULL, 0);
        ASSERT(sp->s_rpc != NULL);

        (void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
            ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

        sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
            SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor, NULL, sp,
            NULL, 0);
        ASSERT(sp->s_rqc != NULL);

        /* Cap on RX packet memory; 0 (the default) disables the cap */
        sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* disabled */

        /* Initialize the receive queue(s) */
        for (index = 0; index < sip->si_nalloc; index++) {
                if ((rc = sfxge_rx_qinit(sp, index)) != 0)
                        goto fail3;
        }

        sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
            DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);

        return (0);

fail3:
        DTRACE_PROBE(fail3);

        /* Tear down the receive queue(s) initialized so far */
        while (--index >= 0)
                sfxge_rx_qfini(sp, index);

        kmem_cache_destroy(sp->s_rqc);
        sp->s_rqc = NULL;

        kmem_cache_destroy(sp->s_rpc);
        sp->s_rpc = NULL;

        sfxge_rx_scale_fini(sp);

fail2:
        DTRACE_PROBE(fail2);
fail1:
        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Start the receive path: compute receive buffer size and alignment
 * for the NIC family in use, initialize the common RX module, start
 * RSS and every receive queue, then point the default MAC filter at
 * RXQ 0.
 *
 * sm_lock is held only around buffer sizing and efx_rx_init()/
 * efx_rx_fini(); queue start/stop runs without it.
 */
int
sfxge_rx_start(sfxge_t *sp)
{
        sfxge_mac_t *smp = &(sp->s_mac);
        sfxge_intr_t *sip;
        const efx_nic_cfg_t *encp;
        size_t hdrlen, align;
        int index;
        int rc;

        mutex_enter(&(smp->sm_lock));

        /* Calculate the receive packet buffer size and alignment */
        sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);

        encp = efx_nic_cfg_get(sp->s_enp);

        /* Packet buffer allocations are cache line aligned */
        EFSYS_ASSERT3U(encp->enc_rx_buf_align_start, <=, SFXGE_CPU_CACHE_SIZE);

        if (sp->s_family == EFX_FAMILY_HUNTINGTON) {
                sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

                hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

                /* Ensure IP headers are 32bit aligned */
                sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
                sp->s_rx_buffer_size += sp->s_rx_buffer_align;

        } else if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
                sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

                /*
                 * Place the start of the buffer a prefix length minus 2
                 * before the start of a cache line. This ensures that the
                 * last two bytes of the prefix (which is where the LFSR hash
                 * is located) are in the same cache line as the headers, and
                 * the IP header is 32-bit aligned.
                 */
                sp->s_rx_buffer_align =
                    SFXGE_CPU_CACHE_SIZE - (encp->enc_rx_prefix_size - 2);
                sp->s_rx_buffer_size += sp->s_rx_buffer_align;
        } else {
                sp->s_rx_prefix_size = 0;

                /*
                 * Place the start of the buffer 2 bytes after a cache line
                 * boundary so that the headers fit into the cache line and
                 * the IP header is 32-bit aligned.
                 */
                hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

                sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
                sp->s_rx_buffer_size += sp->s_rx_buffer_align;
        }

        /* Align end of packet buffer for RX DMA end padding */
        align = MAX(1, encp->enc_rx_buf_align_end);
        EFSYS_ASSERT(ISP2(align));
        sp->s_rx_buffer_size = P2ROUNDUP(sp->s_rx_buffer_size, align);

        /* Initialize the receive module */
        if ((rc = efx_rx_init(sp->s_enp)) != 0)
                goto fail1;

        mutex_exit(&(smp->sm_lock));

        if ((rc = sfxge_rx_scale_start(sp)) != 0)
                goto fail2;

        /* Start the receive queue(s) */
        sip = &(sp->s_intr);
        for (index = 0; index < sip->si_nalloc; index++) {
                if ((rc = sfxge_rx_qstart(sp, index)) != 0)
                        goto fail3;
        }

        ASSERT3U(sp->s_srp[0]->sr_state, ==, SFXGE_RXQ_STARTED);
        /* It is sufficient to have Rx scale initialized */
        ASSERT3U(sp->s_rx_scale.srs_state, ==, SFXGE_RX_SCALE_STARTED);
        rc = efx_mac_filter_default_rxq_set(sp->s_enp, sp->s_srp[0]->sr_erp,
            sp->s_rx_scale.srs_count > 1);
        if (rc != 0)
                goto fail4;

        return (0);

fail4:
        DTRACE_PROBE(fail4);

        /* Fall through: index == si_nalloc, so all queues stop below */
fail3:
        DTRACE_PROBE(fail3);

        /* Stop the receive queue(s) started so far */
        while (--index >= 0)
                sfxge_rx_qstop(sp, index);

        sfxge_rx_scale_stop(sp);

fail2:
        DTRACE_PROBE(fail2);

        mutex_enter(&(smp->sm_lock));

        /* Tear down the receive module */
        efx_rx_fini(sp->s_enp);

fail1:
        DTRACE_PROBE1(fail1, int, rc);

        mutex_exit(&(smp->sm_lock));

        return (rc);
}

/*
 * Report the current RX coalescing mode through *modep.
 * No locking is performed; this is a single-word read from the softc.
 */
void
sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
{
        *modep = sp->s_rx_coalesce_mode;
}

/*
 * Set the RX coalescing mode.  Returns EINVAL for anything other than
 * the three supported modes.
 */
int
sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
{
        int rc;

        switch (mode) {
        case SFXGE_RX_COALESCE_OFF:
        case SFXGE_RX_COALESCE_DISALLOW_PUSH:
        case SFXGE_RX_COALESCE_ALLOW_PUSH:
                sp->s_rx_coalesce_mode = mode;
                return (0);

        default:
                rc = EINVAL;
                break;
        }

        DTRACE_PROBE1(fail1, int, rc);

        return (rc);
}

/*
 * Stop the receive path: clear the default MAC filter, stop every
 * receive queue (highest index first), stop RSS and tear down the
 * common RX module.  Caller holds s_state_lock.
 */
void
sfxge_rx_stop(sfxge_t *sp)
{
        sfxge_mac_t *smp = &(sp->s_mac);
        sfxge_intr_t *sip = &(sp->s_intr);
        efx_nic_t *enp = sp->s_enp;
        int index;

        ASSERT(mutex_owned(&(sp->s_state_lock)));

        efx_mac_filter_default_rxq_clear(enp);

        /* Stop the receive queue(s) */
        for (index = sip->si_nalloc - 1; index >= 0; index--) {
                /* TBD: Flush RXQs in parallel; HW has limit + may need retry */
                sfxge_rx_qstop(sp, index);
        }

        sfxge_rx_scale_stop(sp);

        mutex_enter(&(smp->sm_lock));

        /* Tear down the receive module */
        efx_rx_fini(enp);

        sp->s_rx_buffer_align = 0;
        sp->s_rx_prefix_size = 0;
        sp->s_rx_buffer_size = 0;

        mutex_exit(&(smp->sm_lock));
}

/*
 * Count packets currently loaned out across all receive queues,
 * swizzling each queue's free packet pool under its event queue lock.
 */
unsigned int
sfxge_rx_loaned(sfxge_t *sp)
{
        sfxge_intr_t *sip = &(sp->s_intr);
        unsigned int total = 0;
        int index;

        ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

        for (index = 0; index < sip->si_nalloc; index++) {
                sfxge_rxq_t *srp = sp->s_srp[index];
                sfxge_evq_t *sep = sp->s_sep[srp->sr_index];

                mutex_enter(&(sep->se_lock));
                total += sfxge_rx_qfpp_swizzle(srp);
                mutex_exit(&(sep->se_lock));
        }

        return (total);
}

/*
 * Tear down the receive path: every receive queue (highest index
 * first), both kmem caches and the RSS state.
 */
void
sfxge_rx_fini(sfxge_t *sp)
{
        sfxge_intr_t *sip = &(sp->s_intr);
        int index;

        ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

        sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;

        /* Tear down the receive queue(s) */
        for (index = sip->si_nalloc - 1; index >= 0; index--)
                sfxge_rx_qfini(sp, index);

        /* All packet memory must have been returned by now */
        ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);

        kmem_cache_destroy(sp->s_rqc);
        sp->s_rqc = NULL;

        kmem_cache_destroy(sp->s_rpc);
        sp->s_rpc = NULL;

        sfxge_rx_scale_fini(sp);
}