root/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2021, the University of Queensland
 * Copyright 2020 RackTop Systems, Inc.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/disp.h>
#include <sys/sdt.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

/*
 * CTASSERT(s) to cover bad values which would induce bugs.
 *
 * The build fails if MLXCX_CQ_LWM_GAP is ever made smaller than
 * MLXCX_CQ_HWM_GAP.
 * NOTE(review): presumably these are the completion queue low/high water
 * mark gaps -- confirm against their definitions.
 */
CTASSERT(MLXCX_CQ_LWM_GAP >= MLXCX_CQ_HWM_GAP);

/*
 * Quiesce and disable every interrupt vector of this instance.
 *
 * ddi_intr_disable() alone does not guarantee that a handler is no longer
 * running for the vector. Each enabled event queue is therefore flagged as
 * quiescing and we block on its condition variable until the handler has
 * dropped the "active" flag. Only then is the vector marked as not enabled.
 */
void
mlxcx_intr_disable(mlxcx_t *mlxp)
{
        int vec;

        mlxcx_cmd_eq_disable(mlxp);

        for (vec = 0; vec < mlxp->mlx_intr_count; vec++) {
                mlxcx_event_queue_t *eq = &mlxp->mlx_eqs[vec];

                mutex_enter(&eq->mleq_mtx);

                /* Vectors that are not enabled have nothing to quiesce. */
                if ((eq->mleq_state & MLXCX_EQ_INTR_ENABLED) != 0) {
                        (void) ddi_intr_disable(mlxp->mlx_intr_handles[vec]);

                        /* Ask a running handler to wake us when it is done. */
                        eq->mleq_state |= MLXCX_EQ_INTR_QUIESCE;
                        while ((eq->mleq_state & MLXCX_EQ_INTR_ACTIVE) != 0)
                                cv_wait(&eq->mleq_cv, &eq->mleq_mtx);

                        eq->mleq_state &= ~MLXCX_EQ_INTR_ENABLED;
                }

                mutex_exit(&eq->mleq_mtx);
        }
}

/*
 * Release all per-vector interrupt state.
 *
 * For each vector this verifies that its event queue is no longer allocated
 * and, if it was ever created, that it has been destroyed. Vectors at or
 * above mlx_intr_cq0 also have their (required to be empty) CQ tree
 * destroyed. The handler is then removed, the interrupt freed and the
 * queue's mutex and condition variable destroyed. Finally the handle and
 * event queue arrays are freed and their pointers cleared.
 */
void
mlxcx_intr_teardown(mlxcx_t *mlxp)
{
        int vec;

        for (vec = 0; vec < mlxp->mlx_intr_count; vec++) {
                mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];
                int rv;

                mutex_enter(&mleq->mleq_mtx);
                VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);
                if (mleq->mleq_state & MLXCX_EQ_CREATED)
                        VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);
                if (vec >= mlxp->mlx_intr_cq0) {
                        VERIFY(avl_is_empty(&mleq->mleq_cqs));
                        avl_destroy(&mleq->mleq_cqs);
                }
                mutex_exit(&mleq->mleq_mtx);

                (void) ddi_intr_remove_handler(mlxp->mlx_intr_handles[vec]);

                /* A failure to free is reported but does not stop teardown. */
                rv = ddi_intr_free(mlxp->mlx_intr_handles[vec]);
                if (rv != DDI_SUCCESS) {
                        mlxcx_warn(mlxp, "failed to free interrupt %d: %d",
                            vec, rv);
                }

                mutex_destroy(&mleq->mleq_mtx);
                cv_destroy(&mleq->mleq_cv);
        }

        kmem_free(mlxp->mlx_intr_handles, mlxp->mlx_intr_size);
        mlxp->mlx_intr_handles = NULL;

        kmem_free(mlxp->mlx_eqs, mlxp->mlx_eqs_size);
        mlxp->mlx_eqs = NULL;
}

/*
 * Return the event queue entry at the current consumer index if it is owned
 * by software, advancing the consumer counter past it. Returns NULL when
 * the entry is not (yet) software-owned or when syncing it reported a DMA
 * error.
 */
static mlxcx_eventq_ent_t *
mlxcx_eq_next(mlxcx_event_queue_t *mleq)
{
        mlxcx_eventq_ent_t *ent;
        ddi_fm_error_t err;
        uint_t slot;
        /*
         * The owner value which marks an entry as ours is the bit of the
         * consumer counter at position mleq_entshift.
         */
        const uint_t swowner = ((mleq->mleq_cc >> mleq->mleq_entshift) & 1);

        /*
         * mleq_cc is only kept correct by restricting callers to interrupt
         * context.
         */
        ASSERT(servicing_interrupt());
        ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);

        /* mleq_nents is always a power of 2, so a mask yields the index. */
        slot = mleq->mleq_cc & (mleq->mleq_nents - 1);
        ent = &mleq->mleq_ent[slot];

        /* Sync just this one entry for the CPU before inspecting it. */
        VERIFY0(ddi_dma_sync(mleq->mleq_dma.mxdb_dma_handle,
            (uintptr_t)ent - (uintptr_t)mleq->mleq_ent,
            sizeof (mlxcx_eventq_ent_t), DDI_DMA_SYNC_FORCPU));
        ddi_fm_dma_err_get(mleq->mleq_dma.mxdb_dma_handle, &err,
            DDI_FME_VERSION);

        if (err.fme_status != DDI_FM_OK ||
            (ent->mleqe_owner & 1) != swowner) {
                /*
                 * Either there is nothing new for us, or we hit a DMA
                 * error. In the error case the caller should re-arm this
                 * EQ and try again when the device wakes us back up;
                 * hopefully the fault will be gone by then.
                 */
                ddi_fm_dma_err_clear(mleq->mleq_dma.mxdb_dma_handle,
                    DDI_FME_VERSION);
                return (NULL);
        }

        /* The PRM says we have to membar here, so we're doing it */
        membar_consumer();
        ++mleq->mleq_cc;

        return (ent);
}

/*
 * Arm an event queue: mark it armed, remember the consumer counter it was
 * armed at, and write the arm doorbell (EQ number plus consumer index) to
 * the UAR. If the register write reports an access error it is retried up
 * to mlxcx_doorbell_tries times; if the error persists the service is
 * reported as lost.
 */
void
mlxcx_arm_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
        bits32_t v = new_bits32();
        ddi_fm_error_t err;
        uint_t attempts = 0;

        /*
         * Callers are initialization (arming the EQ for the first time) and
         * the tail end of interrupt processing (re-arming).
         */
        ASSERT(mutex_owned(&mleq->mleq_mtx) || servicing_interrupt());
        ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_POLLING);

        mleq->mleq_cc_armed = mleq->mleq_cc;
        mleq->mleq_state |= MLXCX_EQ_ARMED;

        set_bits32(&v, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
        set_bits32(&v, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

        for (;;) {
                mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_ARM,
                    from_bits32(v));
                ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
                    DDI_FME_VERSION);
                if (err.fme_status == DDI_FM_OK)
                        return;

                /* Out of retries: fall through to report the failure. */
                if (attempts++ >= mlxcx_doorbell_tries)
                        break;

                ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
        }

        ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Tell the device how far we have consumed an event queue without arming
 * it: the EQ number and consumer counter are written to the "no arm"
 * doorbell in the UAR.
 */
static void
mlxcx_update_eq(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
        ddi_fm_error_t err;
        bits32_t dbval = new_bits32();

        /*
         * mleq_cc is only kept correct by restricting callers to interrupt
         * context.
         */
        ASSERT(servicing_interrupt());
        ASSERT(mleq->mleq_state & MLXCX_EQ_CREATED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_DESTROYED);
        ASSERT0(mleq->mleq_state & MLXCX_EQ_ARMED);

        set_bits32(&dbval, MLXCX_EQ_ARM_EQN, mleq->mleq_num);
        set_bits32(&dbval, MLXCX_EQ_ARM_CI, mleq->mleq_cc);

        mlxcx_uar_put32(mlxp, mleq->mleq_uar, MLXCX_UAR_EQ_NOARM,
            from_bits32(dbval));

        /*
         * Any access error is fetched and cleared but deliberately not
         * acted upon: if it is still happening when we try to re-arm the
         * EQ, the impact will be noted then.
         */
        ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
            DDI_FME_VERSION);
        ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
}

/*
 * Return the completion queue entry at the current consumer index if it is
 * owned by software, advancing the consumer counter past it. Returns NULL
 * when the entry is not software-owned or when syncing it reported a DMA
 * error. The caller must hold mlcq_mtx.
 */
static mlxcx_completionq_ent_t *
mlxcx_cq_next(mlxcx_completion_queue_t *mlcq)
{
        mlxcx_completionq_ent_t *ent;
        ddi_fm_error_t err;
        uint_t slot;
        /*
         * The owner value which marks an entry as ours is the bit of the
         * consumer counter at position mlcq_entshift.
         */
        const uint_t swowner = ((mlcq->mlcq_cc >> mlcq->mlcq_entshift) & 1);

        ASSERT(mutex_owned(&mlcq->mlcq_mtx));
        ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
        ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

        /* mlcq_nents is always a power of 2, so a mask yields the index. */
        slot = mlcq->mlcq_cc & (mlcq->mlcq_nents - 1);
        ent = &mlcq->mlcq_ent[slot];

        /* Sync just this one entry for the CPU before inspecting it. */
        VERIFY0(ddi_dma_sync(mlcq->mlcq_dma.mxdb_dma_handle,
            (uintptr_t)ent - (uintptr_t)mlcq->mlcq_ent,
            sizeof (mlxcx_completionq_ent_t), DDI_DMA_SYNC_FORCPU));
        ddi_fm_dma_err_get(mlcq->mlcq_dma.mxdb_dma_handle, &err,
            DDI_FME_VERSION);

        if (err.fme_status != DDI_FM_OK ||
            (ent->mlcqe_owner & 1) != swowner) {
                /* Nothing to consume, or a DMA error which we clear. */
                ddi_fm_dma_err_clear(mlcq->mlcq_dma.mxdb_dma_handle,
                    DDI_FME_VERSION);
                return (NULL);
        }

        /* The PRM says we have to membar here, so we're doing it */
        membar_consumer();
        ++mlcq->mlcq_cc;

        return (ent);
}

/*
 * Publish the completion queue's consumer counter to the device by storing
 * it in the doorbell record and syncing that record for the device. A DMA
 * error on the sync is cleared and the sync retried up to
 * mlxcx_doorbell_tries times; after that the service is reported as lost.
 */
void
mlxcx_update_cqci(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
        ddi_fm_error_t err;
        uint_t attempts = 0;

        mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);

        for (;;) {
                MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
                ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
                    &err, DDI_FME_VERSION);
                if (err.fme_status == DDI_FM_OK)
                        return;

                if (attempts++ >= mlxcx_doorbell_tries) {
                        ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
                        return;
                }

                ddi_fm_dma_err_clear(mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
                    DDI_FME_VERSION);
        }
}

/*
 * Arm a completion queue.
 *
 * The caller must hold both mlcq_arm_mtx and mlcq_mtx. Nothing is done if
 * the queue is being torn down. Otherwise the queue is marked armed, the
 * consumer and mlcq_ec values it was armed at are recorded, the doorbell
 * record is updated and synced for the device, and finally the arm
 * doorbell is written to the UAR.
 *
 * A DMA error on the sync or an access error on the register write is
 * cleared and the whole sequence (sync, then write) is retried. Both kinds
 * of error draw from one budget of mlxcx_doorbell_tries retries; once it is
 * used up the service is reported as lost.
 */
void
mlxcx_arm_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq)
{
        bits32_t dbval = new_bits32();
        uint64_t udbval;
        ddi_fm_error_t err;
        uint_t attempts = 0;

        ASSERT(mutex_owned(&mlcq->mlcq_arm_mtx));
        ASSERT(mutex_owned(&mlcq->mlcq_mtx));
        ASSERT(mlcq->mlcq_state & MLXCX_CQ_CREATED);
        ASSERT0(mlcq->mlcq_state & MLXCX_CQ_DESTROYED);

        /* Re-arming an armed queue requires mlcq_ec to have moved on. */
        if (mlcq->mlcq_state & MLXCX_CQ_ARMED) {
                ASSERT3U(mlcq->mlcq_ec, >, mlcq->mlcq_ec_armed);
        }

        if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
                return;

        atomic_or_uint(&mlcq->mlcq_state, MLXCX_CQ_ARMED);
        mlcq->mlcq_cc_armed = mlcq->mlcq_cc;
        mlcq->mlcq_ec_armed = mlcq->mlcq_ec;

        set_bits32(&dbval, MLXCX_CQ_ARM_SEQ, mlcq->mlcq_ec);
        set_bits32(&dbval, MLXCX_CQ_ARM_CI, mlcq->mlcq_cc);

        /* UAR doorbell: arm word in the upper half, CQ number in the lower. */
        udbval = ((uint64_t)from_bits32(dbval) << 32) |
            (mlcq->mlcq_num & 0xffffff);

        mlcq->mlcq_doorbell->mlcqd_update_ci = to_be24(mlcq->mlcq_cc);
        mlcq->mlcq_doorbell->mlcqd_arm_ci = dbval;

        for (;;) {
                MLXCX_DMA_SYNC(mlcq->mlcq_doorbell_dma, DDI_DMA_SYNC_FORDEV);
                ddi_fm_dma_err_get(mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
                    &err, DDI_FME_VERSION);
                if (err.fme_status != DDI_FM_OK) {
                        if (attempts++ >= mlxcx_doorbell_tries)
                                break;
                        ddi_fm_dma_err_clear(
                            mlcq->mlcq_doorbell_dma.mxdb_dma_handle,
                            DDI_FME_VERSION);
                        continue;
                }

                mlxcx_uar_put64(mlxp, mlcq->mlcq_uar, MLXCX_UAR_CQ_ARM,
                    udbval);
                ddi_fm_acc_err_get(mlxp->mlx_regs_handle, &err,
                    DDI_FME_VERSION);
                if (err.fme_status == DDI_FM_OK)
                        return;

                if (attempts++ >= mlxcx_doorbell_tries)
                        break;
                ddi_fm_acc_err_clear(mlxp->mlx_regs_handle, DDI_FME_VERSION);
        }

        /* Retries exhausted on either the sync or the register write. */
        ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Map an event type to a printable name. Event types without an entry in
 * the table yield "UNKNOWN".
 */
const char *
mlxcx_event_name(mlxcx_event_t evt)
{
        static const struct {
                mlxcx_event_t   en_evt;
                const char      *en_name;
        } names[] = {
                { MLXCX_EVENT_COMPLETION,       "COMPLETION" },
                { MLXCX_EVENT_PATH_MIGRATED,    "PATH_MIGRATED" },
                { MLXCX_EVENT_COMM_ESTABLISH,   "COMM_ESTABLISH" },
                { MLXCX_EVENT_SENDQ_DRAIN,      "SENDQ_DRAIN" },
                { MLXCX_EVENT_LAST_WQE,         "LAST_WQE" },
                { MLXCX_EVENT_SRQ_LIMIT,        "SRQ_LIMIT" },
                { MLXCX_EVENT_DCT_ALL_CLOSED,   "DCT_ALL_CLOSED" },
                { MLXCX_EVENT_DCT_ACCKEY_VIOL,  "DCT_ACCKEY_VIOL" },
                { MLXCX_EVENT_CQ_ERROR,         "CQ_ERROR" },
                { MLXCX_EVENT_WQ_CATASTROPHE,   "WQ_CATASTROPHE" },
                { MLXCX_EVENT_PATH_MIGRATE_FAIL, "PATH_MIGRATE_FAIL" },
                { MLXCX_EVENT_PAGE_FAULT,       "PAGE_FAULT" },
                { MLXCX_EVENT_WQ_INVALID_REQ,   "WQ_INVALID_REQ" },
                { MLXCX_EVENT_WQ_ACCESS_VIOL,   "WQ_ACCESS_VIOL" },
                { MLXCX_EVENT_SRQ_CATASTROPHE,  "SRQ_CATASTROPHE" },
                { MLXCX_EVENT_INTERNAL_ERROR,   "INTERNAL_ERROR" },
                { MLXCX_EVENT_PORT_STATE,       "PORT_STATE" },
                { MLXCX_EVENT_GPIO,             "GPIO" },
                { MLXCX_EVENT_PORT_MODULE,      "PORT_MODULE" },
                { MLXCX_EVENT_TEMP_WARNING,     "TEMP_WARNING" },
                { MLXCX_EVENT_REMOTE_CONFIG,    "REMOTE_CONFIG" },
                { MLXCX_EVENT_DCBX_CHANGE,      "DCBX_CHANGE" },
                { MLXCX_EVENT_DOORBELL_CONGEST, "DOORBELL_CONGEST" },
                { MLXCX_EVENT_STALL_VL,         "STALL_VL" },
                { MLXCX_EVENT_CMD_COMPLETION,   "CMD_COMPLETION" },
                { MLXCX_EVENT_PAGE_REQUEST,     "PAGE_REQUEST" },
                { MLXCX_EVENT_NIC_VPORT,        "NIC_VPORT" },
                { MLXCX_EVENT_EC_PARAMS_CHANGE, "EC_PARAMS_CHANGE" },
                { MLXCX_EVENT_XRQ_ERROR,        "XRQ_ERROR" }
        };
        size_t idx;

        for (idx = 0; idx < sizeof (names) / sizeof (names[0]); idx++) {
                if (names[idx].en_evt == evt)
                        return (names[idx].en_name);
        }

        return ("UNKNOWN");
}

/*
 * Re-query the port's status, speed and FEC from the device and pass the
 * resulting link state on to MAC (if we have a MAC handle). All of this is
 * done with the port's mutex held.
 *
 * Should be called only when link state has changed.
 */
void
mlxcx_update_link_state(mlxcx_t *mlxp, mlxcx_port_t *port)
{
        /* Any operational status we do not recognise maps to "unknown". */
        link_state_t ls = LINK_STATE_UNKNOWN;

        mutex_enter(&port->mlp_mtx);

        /* The results of these queries are deliberately not checked. */
        (void) mlxcx_cmd_query_port_status(mlxp, port);
        (void) mlxcx_cmd_query_port_speed(mlxp, port);
        (void) mlxcx_cmd_query_port_fec(mlxp, port);

        if (port->mlp_oper_status == MLXCX_PORT_STATUS_UP ||
            port->mlp_oper_status == MLXCX_PORT_STATUS_UP_ONCE) {
                ls = LINK_STATE_UP;
        } else if (port->mlp_oper_status == MLXCX_PORT_STATUS_DOWN) {
                ls = LINK_STATE_DOWN;
        }

        if (mlxp->mlx_mac_hdl != NULL)
                mac_link_update(mlxp->mlx_mac_hdl, ls);

        mutex_exit(&port->mlp_mtx);
}

CTASSERT(MLXCX_MANAGE_PAGES_MAX_PAGES < UINT_MAX);

/*
 * Allocate up to npages device pages and hand them to the HCA with a single
 * "give pages" command.
 *
 * The request is capped at MLXCX_MANAGE_PAGES_MAX_PAGES. On success every
 * page is recorded in mlx_pages and mlx_npages is increased, both under
 * mlx_pagemtx. If a page cannot be allocated, or the HCA rejects the
 * command, every page allocated here is freed again and the HCA is told
 * that the allocation failed.
 */
static void
mlxcx_give_pages_once(mlxcx_t *mlxp, size_t npages)
{
        ddi_device_acc_attr_t acc;
        ddi_dma_attr_t attr;
        mlxcx_dev_page_t *mdp;
        mlxcx_dev_page_t **pages;
        size_t i;
        const ddi_dma_cookie_t *ck;

        /*
         * If this isn't enough, the HCA will ask for more
         */
        npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

        /* Zeroed so that cleanup can tell filled slots from unfilled ones. */
        pages = kmem_zalloc(sizeof (*pages) * npages, KM_SLEEP);

        for (i = 0; i < npages; i++) {
                mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
                mlxcx_dma_acc_attr(mlxp, &acc);
                mlxcx_dma_page_attr(mlxp, &attr);
                if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
                    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
                        /* i is a size_t, so it needs %lu just like npages. */
                        mlxcx_warn(mlxp, "failed to allocate 4k page %lu/%lu",
                            i, npages);
                        kmem_free(mdp, sizeof (mlxcx_dev_page_t));
                        goto cleanup_npages;
                }
                ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
                mdp->mxdp_pa = ck->dmac_laddress;
                pages[i] = mdp;
        }

        mutex_enter(&mlxp->mlx_pagemtx);

        if (!mlxcx_cmd_give_pages(mlxp,
            MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
                mlxcx_warn(mlxp, "!hardware refused our gift of %lu "
                    "pages!", npages);
                mutex_exit(&mlxp->mlx_pagemtx);
                goto cleanup_npages;
        }

        for (i = 0; i < npages; i++) {
                avl_add(&mlxp->mlx_pages, pages[i]);
        }
        mlxp->mlx_npages += npages;
        mutex_exit(&mlxp->mlx_pagemtx);

        kmem_free(pages, sizeof (*pages) * npages);

        return;

cleanup_npages:
        /*
         * mlx_pagemtx is NOT held here: the allocation loop never takes it
         * and the command failure path drops it before jumping. It must
         * therefore not be exited again on this path.
         */
        for (i = 0; i < npages; i++) {
                /* The first empty slot marks the end of what we allocated. */
                if ((mdp = pages[i]) == NULL)
                        break;

                mlxcx_dma_free(&mdp->mxdp_dma);
                kmem_free(mdp, sizeof (mlxcx_dev_page_t));
        }
        /* Tell the hardware we had an allocation failure. */
        (void) mlxcx_cmd_give_pages(mlxp, MLXCX_MANAGE_PAGES_OPMOD_ALLOC_FAIL,
            0, NULL);

        kmem_free(pages, sizeof (*pages) * npages);
}

/*
 * Ask the HCA to return up to npages pages and free every page it hands
 * back.
 *
 * The physical addresses reported by the HCA are looked up in mlx_pages;
 * pages found there are removed from the tree and freed, and mlx_npages is
 * decreased. An address we have no record of only produces a warning.
 * If the command itself fails nothing is changed.
 */
static void
mlxcx_take_pages_once(mlxcx_t *mlxp, size_t npages)
{
        size_t i, nret;
        int32_t ret;
        uint64_t *pas;
        mlxcx_dev_page_t *mdp, probe;

        pas = kmem_alloc(sizeof (*pas) * npages, KM_SLEEP);

        if (!mlxcx_cmd_return_pages(mlxp, npages, pas, &ret)) {
                kmem_free(pas, sizeof (*pas) * npages);
                return;
        }

        /*
         * The returned count is used as the bound when walking pas[], which
         * only has room for npages entries. Never let a negative or
         * oversized count take us outside of the array.
         */
        if (ret < 0 || (size_t)ret > npages) {
                mlxcx_warn(mlxp, "hardware returned an invalid page count "
                    "%d (requested %lu)", ret, npages);
                nret = (ret < 0) ? 0 : npages;
        } else {
                nret = (size_t)ret;
        }

        mutex_enter(&mlxp->mlx_pagemtx);

        /* We cannot be getting pages back without having given some out. */
        ASSERT0(avl_is_empty(&mlxp->mlx_pages));

        for (i = 0; i < nret; i++) {
                /* Only the physical address is filled in on the search key. */
                bzero(&probe, sizeof (probe));
                probe.mxdp_pa = pas[i];

                mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

                if (mdp != NULL) {
                        avl_remove(&mlxp->mlx_pages, mdp);
                        mlxp->mlx_npages--;
                        mlxcx_dma_free(&mdp->mxdp_dma);
                        kmem_free(mdp, sizeof (mlxcx_dev_page_t));
                } else {
                        mlxcx_warn(mlxp, "hardware returned a page "
                            "with PA 0x%" PRIx64 " but we have no "
                            "record of giving out such a page", pas[i]);
                }
        }

        mutex_exit(&mlxp->mlx_pagemtx);

        kmem_free(pas, sizeof (*pas) * npages);
}

/*
 * Taskq callback which services a page request from the HCA. arg is the
 * mlxcx_async_param_t that was filled in when the request event arrived.
 */
static void
mlxcx_pages_task(void *arg)
{
        mlxcx_async_param_t *req = arg;
        mlxcx_t *mlxp = req->mla_mlx;
        int32_t delta;

        /*
         * Take a copy of the page count and immediately mark the request as
         * no longer pending, wiping the stored request. We have everything
         * we need, so the mlxcx_async_param_t can safely be reused from
         * here on. We should never see another page request before this
         * one has been answered, and mlxcx_intr_async additionally refuses
         * a new request while one is pending.
         */
        mutex_enter(&req->mla_mtx);
        delta = req->mla_pages.mlp_npages;
        req->mla_pending = B_FALSE;
        bzero(&req->mla_pages, sizeof (req->mla_pages));
        mutex_exit(&req->mla_mtx);

        /*
         * The PRM describes npages as: "Number of missing / unneeded pages
         * (signed number, msb indicate sign)", which implies it is never
         * zero. The sign tells us whether to give or take back pages with
         * the MANAGE_PAGES command; a zero count gives us no direction, so
         * in that case there is nothing to do.
         */
        if (delta == 0)
                return;

        if (delta > 0)
                mlxcx_give_pages_once(mlxp, delta);
        else
                mlxcx_take_pages_once(mlxp, -1 * delta);
}

/*
 * Taskq callback which refreshes the link state of a port. arg is the
 * port's mlxcx_async_param_t.
 */
static void
mlxcx_link_state_task(void *arg)
{
        mlxcx_async_param_t *req = arg;
        mlxcx_t *mlxp;
        mlxcx_port_t *port;

        /*
         * Pick up the arguments and clear the pending status in one go.
         *
         * Clearing "pending" *before* the link state is updated is both
         * safe and required. Safe, because a taskq_ent may be reused by the
         * caller of taskq_dispatch_ent() as soon as the task function has
         * started executing. Required, because otherwise a link state
         * change event arriving during the update could be dropped and we
         * would be left with a stale link state.
         */
        mutex_enter(&req->mla_mtx);
        port = req->mla_port;
        mlxp = req->mla_mlx;
        req->mla_pending = B_FALSE;
        mutex_exit(&req->mla_mtx);

        mlxcx_update_link_state(mlxp, port);
}

/*
 * Map a port module error type to a printable name. Error types not listed
 * here yield "UNKNOWN".
 */
static const char *
mlxcx_module_error_string(mlxcx_module_error_type_t err)
{
        const char *name = "UNKNOWN";

        switch (err) {
        case MLXCX_MODULE_ERR_POWER_BUDGET:
                name = "POWER_BUDGET";
                break;
        case MLXCX_MODULE_ERR_LONG_RANGE:
                name = "LONG_RANGE";
                break;
        case MLXCX_MODULE_ERR_BUS_STUCK:
                name = "BUS_STUCK";
                break;
        case MLXCX_MODULE_ERR_NO_EEPROM:
                name = "NO_EEPROM";
                break;
        case MLXCX_MODULE_ERR_ENFORCEMENT:
                name = "ENFORCEMENT";
                break;
        case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
                name = "UNKNOWN_IDENT";
                break;
        case MLXCX_MODULE_ERR_HIGH_TEMP:
                name = "HIGH_TEMP";
                break;
        case MLXCX_MODULE_ERR_CABLE_SHORTED:
                name = "CABLE_SHORTED";
                break;
        default:
                break;
        }

        return (name);
}

/*
 * Turn a port module event into an FM ereport.
 *
 * Only events whose module status is MLXCX_MODULE_ERROR are reported; any
 * other status is ignored. The device's error type is mapped onto one of
 * the generic NIC transceiver error names and posted together with the
 * driver-specific state, error name and error number. After posting, the
 * service is reported as lost. Nothing is posted if the instance is not
 * ereport capable.
 */
static void
mlxcx_report_module_error(mlxcx_t *mlxp, mlxcx_evdata_port_mod_t *evd)
{
        mlxcx_module_status_t state = evd->mled_port_mod_module_status;
        char ereport_class[FM_MAX_CLASS];
        const char *stname = "error";
        const char *lename;
        const char *ename;
        uint64_t ena;
        uint_t eno;

        if (state != MLXCX_MODULE_ERROR)
                return;

        eno = evd->mled_port_mod_error_type;
        lename = mlxcx_module_error_string(eno);

        /* Pick the generic transceiver error matching the device's error. */
        switch (eno) {
        case MLXCX_MODULE_ERR_ENFORCEMENT:
                ename = DDI_FM_TXR_ERROR_WHITELIST;
                break;
        case MLXCX_MODULE_ERR_UNKNOWN_IDENT:
        case MLXCX_MODULE_ERR_NO_EEPROM:
                ename = DDI_FM_TXR_ERROR_NOTSUPP;
                break;
        case MLXCX_MODULE_ERR_HIGH_TEMP:
                ename = DDI_FM_TXR_ERROR_OVERTEMP;
                break;
        case MLXCX_MODULE_ERR_POWER_BUDGET:
        case MLXCX_MODULE_ERR_LONG_RANGE:
        case MLXCX_MODULE_ERR_CABLE_SHORTED:
                ename = DDI_FM_TXR_ERROR_HWFAIL;
                break;
        case MLXCX_MODULE_ERR_BUS_STUCK:
        default:
                ename = DDI_FM_TXR_ERROR_UNKNOWN;
                break;
        }

        (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
            DDI_FM_NIC, DDI_FM_TXR_ERROR);
        ena = fm_ena_generate(0, FM_ENA_FMT1);

        if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
                return;

        ddi_fm_ereport_post(mlxp->mlx_dip, ereport_class, ena, DDI_NOSLEEP,
            /* compulsory FM props */
            FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
            /* generic NIC txr error event props */
            "error", DATA_TYPE_STRING, ename,
            "port_index", DATA_TYPE_UINT8, 0,
            "txr_index", DATA_TYPE_UINT8, evd->mled_port_mod_module,
            /* local props */
            "mlxcx_state", DATA_TYPE_STRING, stname,
            "mlxcx_error", DATA_TYPE_STRING, lename,
            "mlxcx_error_num", DATA_TYPE_UINT8, eno,
            NULL);
        ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
}

/*
 * Common start of interrupt processing.
 *
 * Under the event queue's mutex, check that the vector has not been
 * disabled and that the queue is in a usable state (allocated, created and
 * not destroyed). If so the vector is marked active and B_TRUE is
 * returned; the caller must then finish with mlxcx_intr_fini(). Otherwise
 * B_FALSE is returned and the interrupt must not be processed.
 */
static boolean_t
mlxcx_intr_ini(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
        boolean_t proceed = B_FALSE;

        mutex_enter(&mleq->mleq_mtx);

        if ((mleq->mleq_state & MLXCX_EQ_INTR_ENABLED) == 0) {
                /* Vector is disabled (or being disabled): stay out. */
        } else if (!(mleq->mleq_state & MLXCX_EQ_ALLOC) ||
            !(mleq->mleq_state & MLXCX_EQ_CREATED) ||
            (mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
                mlxcx_warn(mlxp, "intr %d in bad eq state",
                    mleq->mleq_intr_index);
        } else {
                mleq->mleq_state |= MLXCX_EQ_INTR_ACTIVE;
                proceed = B_TRUE;
        }

        mutex_exit(&mleq->mleq_mtx);

        return (proceed);
}

/*
 * Common end of interrupt processing.
 *
 * Drops the "active" mark from the vector. If a shutdown has flagged the
 * vector as quiescing, the thread waiting on the queue's condition variable
 * is woken up so that it can see the vector is idle.
 */
static void
mlxcx_intr_fini(mlxcx_event_queue_t *mleq)
{
        mutex_enter(&mleq->mleq_mtx);

        mleq->mleq_state &= ~MLXCX_EQ_INTR_ACTIVE;

        /* The waiter cannot run until we release the mutex below. */
        if ((mleq->mleq_state & MLXCX_EQ_INTR_QUIESCE) != 0)
                cv_signal(&mleq->mleq_cv);

        mutex_exit(&mleq->mleq_mtx);
}

/*
 * Interrupt handler for an event queue carrying asynchronous events.
 *
 * arg is the mlxcx_t and arg2 the event queue this vector services. Every
 * software-owned entry on the queue is consumed:
 *  - command completions are passed to mlxcx_cmd_completion();
 *  - page requests are recorded in the per-function mlx_npages_req slot and
 *    handed to mlxcx_pages_task on the async taskq;
 *  - port state changes are handed to mlxcx_link_state_task on the async
 *    taskq, unless one is already pending for that port;
 *  - port module events are turned into FM ereports;
 *  - anything else is logged as unhandled.
 * Only the first two kinds are acted upon while the queue is flagged as
 * MLXCX_EQ_ATTACHING. Once the queue has been drained it is re-armed.
 *
 * Always returns DDI_INTR_CLAIMED.
 */
static uint_t
mlxcx_intr_async(caddr_t arg, caddr_t arg2)
{
        mlxcx_t *mlxp = (mlxcx_t *)arg;
        mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
        mlxcx_eventq_ent_t *ent;
        mlxcx_async_param_t *param;
        uint_t portn;
        uint16_t func;

        /* Vector disabled or EQ in a bad state: nothing to process. */
        if (!mlxcx_intr_ini(mlxp, mleq))
                return (DDI_INTR_CLAIMED);

        /*
         * With no entry to consume we leave straight away. Note that the
         * armed state is left untouched and the EQ is not re-armed on this
         * path.
         */
        ent = mlxcx_eq_next(mleq);
        if (ent == NULL) {
                goto done;
        }

        /* Clear the armed flag; mlxcx_arm_eq() below asserts it is clear. */
        ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
        mleq->mleq_state &= ~MLXCX_EQ_ARMED;

        for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
                DTRACE_PROBE2(event, mlxcx_t *, mlxp, mlxcx_eventq_ent_t *,
                    ent);

                /*
                 * Handle events which can be processed while we're still in
                 * mlxcx_attach(). Everything on the mlxcx_t which these events
                 * use must be allocated and set up prior to the call to
                 * mlxcx_setup_async_eqs().
                 */
                switch (ent->mleqe_event_type) {
                case MLXCX_EVENT_CMD_COMPLETION:
                        mlxcx_cmd_completion(mlxp, ent);
                        continue;
                case MLXCX_EVENT_PAGE_REQUEST:
                        /* The function id selects the request slot to use. */
                        func = from_be16(ent->mleqe_page_request.
                            mled_page_request_function_id);
                        VERIFY3U(func, <=, MLXCX_FUNC_ID_MAX);

                        param = &mlxp->mlx_npages_req[func];
                        mutex_enter(&param->mla_mtx);
                        if (param->mla_pending) {
                                /*
                                 * The PRM states we will not get another
                                 * page request event until any pending have
                                 * been posted as complete to the HCA.
                                 * This will guard against this anyway.
                                 */
                                mutex_exit(&param->mla_mtx);
                                mlxcx_warn(mlxp, "Unexpected page request "
                                    "whilst another is pending");
                                continue;
                        }
                        /* Signed count: mlxcx_pages_task acts on its sign. */
                        param->mla_pages.mlp_npages =
                            (int32_t)from_be32(ent->mleqe_page_request.
                            mled_page_request_num_pages);
                        param->mla_pages.mlp_func = func;
                        param->mla_pending = B_TRUE;
                        ASSERT3P(param->mla_mlx, ==, mlxp);
                        mutex_exit(&param->mla_mtx);

                        taskq_dispatch_ent(mlxp->mlx_async_tq, mlxcx_pages_task,
                            param, 0, &param->mla_tqe);
                        continue;
                }

                /*
                 * All other events should be ignored while in attach.
                 */
                mutex_enter(&mleq->mleq_mtx);
                if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
                        mutex_exit(&mleq->mleq_mtx);
                        continue;
                }
                mutex_exit(&mleq->mleq_mtx);

                switch (ent->mleqe_event_type) {
                case MLXCX_EVENT_PORT_STATE:
                        /* Event port numbers start at 1; our array at 0. */
                        portn = get_bits8(
                            ent->mleqe_port_state.mled_port_state_port_num,
                            MLXCX_EVENT_PORT_NUM) - 1;
                        if (portn >= mlxp->mlx_nports)
                                break;

                        param = &mlxp->mlx_ports[portn].mlx_port_event;
                        mutex_enter(&param->mla_mtx);
                        if (param->mla_pending) {
                                /*
                                 * There is a link state event pending
                                 * processing. When that event is handled
                                 * it will get the current link state.
                                 */
                                mutex_exit(&param->mla_mtx);
                                break;
                        }

                        ASSERT3P(param->mla_mlx, ==, mlxp);
                        ASSERT3P(param->mla_port, ==, &mlxp->mlx_ports[portn]);

                        param->mla_pending = B_TRUE;
                        mutex_exit(&param->mla_mtx);

                        taskq_dispatch_ent(mlxp->mlx_async_tq,
                            mlxcx_link_state_task, param, 0, &param->mla_tqe);
                        break;
                case MLXCX_EVENT_PORT_MODULE:
                        mlxcx_report_module_error(mlxp, &ent->mleqe_port_mod);
                        break;
                default:
                        mlxcx_warn(mlxp, "unhandled event 0x%x on intr %d",
                            ent->mleqe_event_type, mleq->mleq_intr_index);
                }
        }

        /* Queue drained: re-arm it so the device interrupts us again. */
        mlxcx_arm_eq(mlxp, mleq);

done:
        mlxcx_intr_fini(mleq);
        return (DDI_INTR_CLAIMED);
}

/*
 * Drain completion entries from the completion queue `mlcq`.
 *
 * For every entry returned by mlxcx_cq_next() the buffer whose WQE index
 * matches the entry's WQE counter is looked up on mlcq_buffers (pulling in
 * anything parked on mlcq_buffers_b when the first search fails), removed
 * from that list and handed to mlxcx_tx_completion() or
 * mlxcx_rx_completion() depending on the type of the attached work queue.
 * Any mblks produced by receive completions are linked through b_next and
 * handed back through *mpp.
 *
 * The loop ends when mlxcx_cq_next() has nothing more to offer, when more
 * than mldp_rx_per_cq frames have been gathered, or when `bytelim` is
 * non-zero and the accumulated byte count exceeds it.
 *
 * Returns B_FALSE, with *mpp left NULL, when the CQ state flags show it is
 * not usable or when MLXCX_CQ_TEARDOWN is noticed part way through the loop.
 * Returns B_TRUE otherwise (even if no entries were found).
 *
 * Both callers in this file arrive here with mlcq_mtx held:
 * mlxcx_rx_poll() asserts ownership and mlxcx_intr_n() acquires it.
 */
static boolean_t
mlxcx_process_cq(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, mblk_t **mpp,
    size_t bytelim)
{
        mlxcx_work_queue_t *wq = mlcq->mlcq_wq;
        mlxcx_completionq_ent_t *cent;
        mblk_t *mp, *cmp, *nmp;
        mlxcx_buffer_t *buf;
        boolean_t found, added;
        size_t bytes = 0;
        uint_t rx_frames = 0;
        uint_t comp_cnt = 0;
        int64_t wqebbs, bufcnt;

        *mpp = NULL;

        /* Only a CQ that is allocated, created and still live is processed. */
        if (!(mlcq->mlcq_state & MLXCX_CQ_ALLOC) ||
            !(mlcq->mlcq_state & MLXCX_CQ_CREATED) ||
            (mlcq->mlcq_state & MLXCX_CQ_DESTROYED) ||
            (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
                return (B_FALSE);
        }

        /* mp is the head of the rx chain, cmp its current tail. */
        nmp = cmp = mp = NULL;

        /*
         * wqebbs and bufcnt accumulate what has been completed since the
         * last time the shared counters were adjusted.
         */
        wqebbs = 0;
        bufcnt = 0;
        for (cent = mlxcx_cq_next(mlcq); cent != NULL;
            cent = mlxcx_cq_next(mlcq)) {
                /*
                 * Teardown and ring stop can atomic_or this flag
                 * into our state if they want us to stop early.
                 *
                 * NOTE(review): on this path any frames already chained
                 * on mp are not handed back to the caller, and the pending
                 * bufcnt/wqebbs deltas are not applied - confirm that the
                 * teardown path accounts for both.
                 */
                if (mlcq->mlcq_state & MLXCX_CQ_TEARDOWN)
                        return (B_FALSE);

                comp_cnt++;
                if (cent->mlcqe_opcode == MLXCX_CQE_OP_REQ &&
                    cent->mlcqe_send_wqe_opcode == MLXCX_WQE_OP_NOP) {
                        /*
                         * NOP: the buffer lookup is skipped entirely and
                         * only the wqebb_used counter is decremented.
                         */
                        atomic_dec_64(&wq->mlwq_wqebb_used);
                        goto nextcq;
                }

lookagain:
                /*
                 * Generally the buffer we're looking for will be
                 * at the front of the list, so this loop won't
                 * need to look far.
                 */
                buf = list_head(&mlcq->mlcq_buffers);
                found = B_FALSE;
                while (buf != NULL) {
                        /*
                         * Only the low 16 bits of the stored WQE index are
                         * compared against the 16-bit counter in the entry.
                         */
                        if ((buf->mlb_wqe_index & UINT16_MAX) ==
                            from_be16(cent->mlcqe_wqe_counter)) {
                                found = B_TRUE;
                                break;
                        }
                        buf = list_next(&mlcq->mlcq_buffers, buf);
                }

                if (!found) {
                        /*
                         * If there's any buffers waiting on the
                         * buffers_b list, then merge those into
                         * the main list and have another look.
                         *
                         * The wq enqueue routines push new buffers
                         * into buffers_b so that they can avoid
                         * taking the mlcq_mtx and blocking us for
                         * every single packet.
                         */
                        added = B_FALSE;
                        mutex_enter(&mlcq->mlcq_bufbmtx);
                        if (!list_is_empty(&mlcq->mlcq_buffers_b)) {
                                list_move_tail(&mlcq->mlcq_buffers,
                                    &mlcq->mlcq_buffers_b);
                                added = B_TRUE;
                        }
                        mutex_exit(&mlcq->mlcq_bufbmtx);
                        if (added)
                                goto lookagain;

                        /*
                         * This check could go just after the lookagain
                         * label, but it is a hot code path so we don't
                         * want to unnecessarily grab a lock and check
                         * a flag for a relatively rare event (the ring
                         * being stopped).
                         */
                        mutex_enter(&wq->mlwq_mtx);
                        if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
                                mutex_exit(&wq->mlwq_mtx);
                                goto nextcq;
                        }
                        mutex_exit(&wq->mlwq_mtx);

                        /*
                         * The work queue is started yet no buffer matches:
                         * log it, raise an FM ereport and move on to the
                         * next entry.
                         */
                        buf = list_head(&mlcq->mlcq_buffers);
                        mlxcx_warn(mlxp, "got completion on CQ %x but "
                            "no buffer matching wqe found: %x (first "
                            "buffer counter = %x)", mlcq->mlcq_num,
                            from_be16(cent->mlcqe_wqe_counter),
                            buf == NULL ? UINT32_MAX :
                            buf->mlb_wqe_index);
                        mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
                        goto nextcq;
                }

                /*
                 * The buf is likely to be freed below, count this now.
                 */
                wqebbs += buf->mlb_wqebbs;

                list_remove(&mlcq->mlcq_buffers, buf);
                bufcnt++;

                switch (mlcq->mlcq_wq->mlwq_type) {
                case MLXCX_WQ_TYPE_SENDQ:
                        mlxcx_tx_completion(mlxp, mlcq, cent, buf);
                        break;
                case MLXCX_WQ_TYPE_RECVQ:
                        nmp = mlxcx_rx_completion(mlxp, mlcq, cent, buf);
                        /*
                         * The byte count is charged against bytelim even
                         * when no mblk comes back for this completion.
                         */
                        bytes += from_be32(cent->mlcqe_byte_cnt);
                        if (nmp != NULL) {
                                /* Append to the b_next chain. */
                                if (cmp != NULL) {
                                        cmp->b_next = nmp;
                                        cmp = nmp;
                                } else {
                                        mp = cmp = nmp;
                                }

                                rx_frames++;
                        }
                        break;
                }

                /*
                 * Update the consumer index with what has been processed,
                 * followed by driver counters. It is important to tell the
                 * hardware first, otherwise when we throw more packets at
                 * it, it may get an overflow error.
                 * We do this whenever we've processed enough to bridge the
                 * high->low water mark.
                 */
                if (bufcnt > (MLXCX_CQ_LWM_GAP - MLXCX_CQ_HWM_GAP)) {
                        mlxcx_update_cqci(mlxp, mlcq);
                        /*
                         * Both these variables are incremented using
                         * atomics as they are modified in other code paths
                         * (Eg during tx) which hold different locks.
                         */
                        atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
                        atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
                        /*
                         * Everything so far has been reported; start the
                         * accumulators over so the flush after the loop
                         * only covers entries consumed from here on.
                         */
                        wqebbs = 0;
                        bufcnt = 0;
                        comp_cnt = 0;
                }
nextcq:
                /* Stop once the frame budget or the byte limit is exceeded. */
                if (rx_frames > mlxp->mlx_props.mldp_rx_per_cq ||
                    (bytelim != 0 && bytes > bytelim))
                        break;
        }

        /*
         * Flush whatever was consumed since the last in-loop update. This is
         * keyed on comp_cnt rather than bufcnt so that entries which had no
         * buffer (NOPs, unmatched completions) still advance the consumer
         * index.
         */
        if (comp_cnt > 0) {
                mlxcx_update_cqci(mlxp, mlcq);
                atomic_add_64(&mlcq->mlcq_bufcnt, -bufcnt);
                atomic_add_64(&wq->mlwq_wqebb_used, -wqebbs);
        }

        *mpp = mp;
        return (B_TRUE);
}


/*
 * Poll a receive completion queue on behalf of the caller.
 *
 * The caller must already own mlcq_mtx and the CQ must be attached to a
 * receive work queue; both conditions are asserted. `bytelim` is passed
 * straight through to mlxcx_process_cq().
 *
 * Returns the chain of mblks gathered by mlxcx_process_cq(), or NULL when
 * it produced none.
 */
mblk_t *
mlxcx_rx_poll(mlxcx_t *mlxp, mlxcx_completion_queue_t *mlcq, size_t bytelim)
{
        mblk_t *chain = NULL;

        ASSERT(mutex_owned(&mlcq->mlcq_mtx));
        ASSERT(mlcq->mlcq_wq != NULL);
        ASSERT3U(mlcq->mlcq_wq->mlwq_type, ==, MLXCX_WQ_TYPE_RECVQ);

        /*
         * The boolean result is deliberately ignored: the chain pointer is
         * set to NULL by mlxcx_process_cq() whenever it declines to run.
         */
        (void) mlxcx_process_cq(mlxp, mlcq, &chain, bytelim);
        return (chain);
}

/*
 * Interrupt handler for the completion event queue vectors. It is the
 * handler mlxcx_intr_setup() registers for every vector from mlx_intr_cq0
 * upwards.
 *
 * `arg` is the mlxcx_t for the device and `arg2` is the
 * mlxcx_event_queue_t belonging to the vector that fired.
 *
 * Each event names a completion queue by number. The CQ is looked up in
 * the EQ's mleq_cqs tree and, unless it is in polling mode, drained with
 * mlxcx_process_cq() and re-armed. After the CQ locks are dropped, mac is
 * told about newly unblocked transmit rings and is handed any received
 * frames. Finally the EQ itself is re-armed.
 *
 * Always returns DDI_INTR_CLAIMED.
 */
static uint_t
mlxcx_intr_n(caddr_t arg, caddr_t arg2)
{
        mlxcx_t *mlxp = (mlxcx_t *)arg;
        mlxcx_event_queue_t *mleq = (mlxcx_event_queue_t *)arg2;
        mlxcx_eventq_ent_t *ent;
        mlxcx_completion_queue_t *mlcq, probe;
        mlxcx_work_queue_t *mlwq;
        mblk_t *mp = NULL;
        boolean_t tellmac = B_FALSE;

        /*
         * mlxcx_intr_ini() (defined outside this section) decides whether
         * the handler may run at all; when it says no, mlxcx_intr_fini()
         * is not called.
         */
        if (!mlxcx_intr_ini(mlxp, mleq))
                return (DDI_INTR_CLAIMED);

        ent = mlxcx_eq_next(mleq);
        if (ent == NULL) {
                /*
                 * The vector fired but there is no event to consume. Count
                 * these, and once the count exceeds mlxcx_stuck_intr_count
                 * report the fault and disable the vector.
                 */
                if (++mleq->mleq_badintrs > mlxcx_stuck_intr_count) {
                        mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_BADINT_LIMIT);
                        ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
                        (void) ddi_intr_disable(mlxp->mlx_intr_handles[
                            mleq->mleq_intr_index]);
                }
                goto done;
        }
        /* A real event arrived, so the bad interrupt streak is over. */
        mleq->mleq_badintrs = 0;

        /* The EQ has fired, so it is no longer armed until re-armed below. */
        mutex_enter(&mleq->mleq_mtx);
        ASSERT(mleq->mleq_state & MLXCX_EQ_ARMED);
        mleq->mleq_state &= ~MLXCX_EQ_ARMED;
#if defined(DEBUG)
        /*
         * If we're still in mlxcx_attach and an intr_n fired, something really
         * weird is going on. This shouldn't happen in the absence of a driver
         * or firmware bug, so in the interests of minimizing branches in this
         * function this check is under DEBUG.
         */
        if (mleq->mleq_state & MLXCX_EQ_ATTACHING) {
                mutex_exit(&mleq->mleq_mtx);
                mlxcx_warn(mlxp, "intr_n (%u) fired during attach, disabling "
                    "vector", mleq->mleq_intr_index);
                mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_INVAL_STATE);
                ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_LOST);
                (void) ddi_intr_disable(mlxp->mlx_intr_handles[
                    mleq->mleq_intr_index]);
                goto done;
        }
#endif
        mutex_exit(&mleq->mleq_mtx);

        for (; ent != NULL; ent = mlxcx_eq_next(mleq)) {
                ASSERT3U(ent->mleqe_event_type, ==, MLXCX_EVENT_COMPLETION);

                /*
                 * Look the CQ up by number using a stack probe with only
                 * mlcq_num filled in.
                 * NOTE(review): this relies on the tree's comparator
                 * (mlxcx_cq_compare) reading nothing but mlcq_num - confirm.
                 */
                probe.mlcq_num =
                    from_be24(ent->mleqe_completion.mled_completion_cqn);
                mutex_enter(&mleq->mleq_mtx);
                mlcq = avl_find(&mleq->mleq_cqs, &probe, NULL);
                mutex_exit(&mleq->mleq_mtx);

                /* No such CQ on this EQ: nothing to process for this event. */
                if (mlcq == NULL)
                        goto update_eq;

                mlwq = mlcq->mlcq_wq;

                /*
                 * mlcq_arm_mtx is used to avoid race conditions between
                 * this interrupt routine and the transition from polling
                 * back to interrupt mode. When exiting poll mode the
                 * CQ is likely to be un-armed, which means there will
                 * be no events for the CQ coming though here,
                 * consequently very low contention on mlcq_arm_mtx.
                 *
                 * mlcq_arm_mtx must be released before calls into mac
                 * layer in order to avoid deadlocks.
                 */
                mutex_enter(&mlcq->mlcq_arm_mtx);
                mlcq->mlcq_ec++;
                atomic_and_uint(&mlcq->mlcq_state, ~MLXCX_CQ_ARMED);

                if (mutex_tryenter(&mlcq->mlcq_mtx) == 0) {
                        /*
                         * If we failed to take the mutex because the
                         * polling function has it, just move on.
                         * We don't want to block other CQs behind
                         * this one.
                         */
                        if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) != 0) {
                                mutex_exit(&mlcq->mlcq_arm_mtx);
                                goto update_eq;
                        }

                        /* Otherwise we will wait. */
                        mutex_enter(&mlcq->mlcq_mtx);
                }

                /*
                 * Both mlcq_arm_mtx and mlcq_mtx are held from here until
                 * one of the two branches below releases them.
                 */
                if ((mlcq->mlcq_state & MLXCX_CQ_POLLING) == 0 &&
                    mlxcx_process_cq(mlxp, mlcq, &mp, 0)) {
                        /*
                         * The ring is not in polling mode and we processed
                         * some completion queue entries.
                         */
                        /*
                         * Clear the "blocked" flags once the in-use counts
                         * have dropped below the low water mark, and
                         * remember to notify mac after the locks are gone.
                         */
                        if ((mlcq->mlcq_state & MLXCX_CQ_BLOCKED_MAC) != 0 &&
                            mlcq->mlcq_bufcnt < mlcq->mlcq_buflwm) {
                                atomic_and_uint(&mlcq->mlcq_state,
                                    ~MLXCX_CQ_BLOCKED_MAC);
                                tellmac = B_TRUE;
                        }

                        if ((mlwq->mlwq_state & MLXCX_WQ_BLOCKED_MAC) != 0 &&
                            mlwq->mlwq_wqebb_used < mlwq->mlwq_buflwm) {
                                atomic_and_uint(&mlwq->mlwq_state,
                                    ~MLXCX_WQ_BLOCKED_MAC);
                                tellmac = B_TRUE;
                        }

                        mlxcx_arm_cq(mlxp, mlcq);

                        mutex_exit(&mlcq->mlcq_mtx);
                        mutex_exit(&mlcq->mlcq_arm_mtx);

                        /* Calls into mac happen only with both locks dropped. */
                        if (tellmac) {
                                mac_tx_ring_update(mlxp->mlx_mac_hdl,
                                    mlcq->mlcq_mac_hdl);
                                tellmac = B_FALSE;
                        }

                        if (mp != NULL) {
                                mac_rx_ring(mlxp->mlx_mac_hdl,
                                    mlcq->mlcq_mac_hdl, mp, mlcq->mlcq_mac_gen);
                        }
                } else {
                        /*
                         * Either the CQ is being polled or
                         * mlxcx_process_cq() declined to run; the CQ is
                         * not re-armed on this path.
                         */
                        mutex_exit(&mlcq->mlcq_mtx);
                        mutex_exit(&mlcq->mlcq_arm_mtx);
                }

update_eq:
                /*
                 * Updating the consumer counter for an EQ requires a write
                 * to the UAR, which is possibly expensive.
                 *
                 * Try to do it only often enough to stop us wrapping around.
                 */
                /* That is, whenever the low three bits of mleq_cc are zero. */
                if ((mleq->mleq_cc & 0x7) == 0)
                        mlxcx_update_eq(mlxp, mleq);
        }

        mlxcx_arm_eq(mlxp, mleq);

done:
        mlxcx_intr_fini(mleq);
        return (DDI_INTR_CLAIMED);
}

/*
 * Allocate the device's MSI-X vectors, pick interrupt priorities for them
 * and register the interrupt handlers.
 *
 * Vector 0 gets mlxcx_intr_async() and runs at a priority above the other
 * vectors; every vector from mlx_intr_cq0 upwards gets mlxcx_intr_n() and
 * its own event queue with an AVL tree of completion queues.
 *
 * Returns B_TRUE on success. On failure a warning is logged and B_FALSE is
 * returned; once the handle array has been allocated every failure path
 * also runs mlxcx_intr_teardown() before returning.
 */
boolean_t
mlxcx_intr_setup(mlxcx_t *mlxp)
{
        dev_info_t *dip = mlxp->mlx_dip;
        mlxcx_eventq_type_t eqt = MLXCX_EQ_TYPE_ANY;
        int nintrs = 0;
        int navail = 0;
        int types, i;

        /*
         * Nothing has been allocated yet, so the checks in this first
         * section can simply return on failure.
         */
        if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
                mlxcx_warn(mlxp, "Failed to get supported interrupt types");
                return (B_FALSE);
        }
        if ((types & DDI_INTR_TYPE_MSIX) == 0) {
                mlxcx_warn(mlxp, "MSI-X interrupts not available, but mlxcx "
                    "requires MSI-X");
                return (B_FALSE);
        }

        if (ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs) !=
            DDI_SUCCESS) {
                mlxcx_warn(mlxp, "Failed to get number of interrupts");
                return (B_FALSE);
        }
        if (nintrs < 2) {
                mlxcx_warn(mlxp, "%d MSI-X interrupts supported, but mlxcx "
                    "requires 2", nintrs);
                return (B_FALSE);
        }

        if (ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX, &navail) !=
            DDI_SUCCESS) {
                mlxcx_warn(mlxp,
                    "Failed to get number of available interrupts");
                return (B_FALSE);
        }
        if (navail < 2) {
                mlxcx_warn(mlxp, "%d MSI-X interrupts available, but mlxcx "
                    "requires 2", navail);
                return (B_FALSE);
        }

        mlxp->mlx_intr_size = navail * sizeof (ddi_intr_handle_t);
        mlxp->mlx_intr_handles = kmem_alloc(mlxp->mlx_intr_size, KM_SLEEP);

        /*
         * Vector 0 carries asynchronous events. Completion queue events
         * use vector 1 and every vector after it.
         */
        mlxp->mlx_intr_cq0 = 1;

        if (ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
            0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL) !=
            DDI_SUCCESS) {
                mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
                goto fail;
        }
        if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
                mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
                    "requires %d", mlxp->mlx_intr_count,
                    mlxp->mlx_intr_cq0 + 1);
                goto fail;
        }
        mlxp->mlx_intr_type = DDI_INTR_TYPE_MSIX;

        if (ddi_intr_get_pri(mlxp->mlx_intr_handles[0],
            &mlxp->mlx_intr_pri) != DDI_SUCCESS) {
                mlxcx_warn(mlxp, "Failed to get interrupt priority");
                goto fail;
        }

        /*
         * The async vector must run at a higher priority than the ring
         * vectors. Some operations issue commands while holding a CQ
         * mutex and depend on the async handler to post the completion.
         * Ring processing takes that same CQ mutex, so a ring vector that
         * shares a CPU with the async vector could hold the async
         * interrupt thread off and deadlock. A higher priority guarantees
         * the async vector is always dispatched.
         */
        mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
        if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
                mlxp->mlx_async_intr_pri++;
        } else {
                mlxp->mlx_intr_pri--;
        }

        mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
            sizeof (mlxcx_event_queue_t);
        mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);

        /*
         * mlxcx_intr_teardown() expects the mutex and AVL tree of every
         * event queue to be initialised, so set them all up before
         * anything else below can fail.
         */
        for (i = 0; i < mlxp->mlx_intr_count; ++i) {
                mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];
                uint_t pri;

                if (i == 0)
                        pri = mlxp->mlx_async_intr_pri;
                else
                        pri = mlxp->mlx_intr_pri;

                mutex_init(&mleq->mleq_mtx, NULL, MUTEX_DRIVER,
                    DDI_INTR_PRI(pri));
                cv_init(&mleq->mleq_cv, NULL, CV_DRIVER, NULL);

                /* Only the completion vectors own a tree of CQs. */
                if (i >= mlxp->mlx_intr_cq0) {
                        avl_create(&mleq->mleq_cqs, mlxcx_cq_compare,
                            sizeof (mlxcx_completion_queue_t),
                            offsetof(mlxcx_completion_queue_t,
                            mlcq_eq_entry));
                }
        }

        /*
         * Search downwards for a priority the async vector will accept.
         * Some PSMs can only accommodate a limited number of vectors at
         * each priority level (or group of priority levels). Because the
         * async priority has to stay above the ring priority, both are
         * lowered together on every failed attempt; the ring priority
         * itself is applied further down.
         */
        for (; mlxp->mlx_async_intr_pri > DDI_INTR_PRI_MIN;
            mlxp->mlx_async_intr_pri--, mlxp->mlx_intr_pri--) {
                if (ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
                    mlxp->mlx_async_intr_pri) == DDI_SUCCESS)
                        break;
                mlxcx_note(mlxp,
                    "!Failed to set interrupt priority to %u for "
                    "async interrupt vector", mlxp->mlx_async_intr_pri);
        }

        if (mlxp->mlx_async_intr_pri == DDI_INTR_PRI_MIN) {
                mlxcx_warn(mlxp, "Failed to find an interrupt priority for "
                    "async interrupt vector");
                goto fail;
        }

        if (ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
            (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]) != DDI_SUCCESS) {
                mlxcx_warn(mlxp, "Failed to add async interrupt handler");
                goto fail;
        }

        /*
         * With enough vectors, give each event queue a "type" so that RX
         * and TX queues are never mixed on the same EQ.
         */
        if (mlxp->mlx_intr_count >= 8)
                eqt = MLXCX_EQ_TYPE_RX;

        for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
                mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[i];

                mleq->mleq_intr_index = i;
                mleq->mleq_type = eqt;

                /* Alternate RX and TX; a type of ANY is left untouched. */
                switch (eqt) {
                case MLXCX_EQ_TYPE_RX:
                        eqt = MLXCX_EQ_TYPE_TX;
                        break;
                case MLXCX_EQ_TYPE_TX:
                        eqt = MLXCX_EQ_TYPE_RX;
                        break;
                default:
                        break;
                }

                /*
                 * Search downwards for a priority this vector accepts. A
                 * priority lowered here carries over to later vectors.
                 */
                for (; mlxp->mlx_intr_pri >= DDI_INTR_PRI_MIN;
                    mlxp->mlx_intr_pri--) {
                        if (ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
                            mlxp->mlx_intr_pri) == DDI_SUCCESS)
                                break;
                        mlxcx_note(mlxp, "!Failed to set interrupt priority to "
                            "%u for interrupt vector %d",
                            mlxp->mlx_intr_pri, i);
                }
                if (mlxp->mlx_intr_pri < DDI_INTR_PRI_MIN) {
                        mlxcx_warn(mlxp,
                            "Failed to find an interrupt priority for "
                            "interrupt vector %d", i);
                        goto fail;
                }

                if (ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
                    mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)mleq) !=
                    DDI_SUCCESS) {
                        mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
                            i);
                        goto fail;
                }
        }

        return (B_TRUE);

fail:
        /* Common exit for every failure after the handle array exists. */
        mlxcx_intr_teardown(mlxp);
        return (B_FALSE);
}