root/usr/src/uts/common/io/ib/clients/eoib/enx_hdlrs.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/ksynch.h>
#include <sys/callb.h>
#include <sys/ib/mgt/sm_attr.h>         /* SM_INIT_TYPE_REPLY_... */

#include <sys/ib/clients/eoib/enx_impl.h>

/*
 * Static function declarations
 */
static void eibnx_gw_is_alive(eibnx_gw_info_t *);
static void eibnx_gw_is_aware(eibnx_thr_info_t *, eibnx_gw_info_t *, boolean_t);
static void eibnx_process_rx(eibnx_thr_info_t *, ibt_wc_t *, eibnx_wqe_t *);
static void eibnx_handle_wcerr(uint8_t, eibnx_wqe_t *, eibnx_thr_info_t *);
static void eibnx_handle_login_ack(eibnx_thr_info_t *, uint8_t *);
static void eibnx_handle_gw_rebirth(eibnx_thr_info_t *, uint16_t);
static void eibnx_handle_gw_info_update(eibnx_thr_info_t *, uint16_t, void *);
static int eibnx_replace_portinfo(eibnx_thr_info_t *, ibt_hca_portinfo_t *,
    uint_t);
static void eibnx_handle_port_events(ibt_hca_hdl_t, uint8_t);
static void eibnx_handle_hca_attach(ib_guid_t);
static void eibnx_handle_hca_detach(ib_guid_t);

/*
 * NDI event handle we need
 */
extern ndi_event_hdl_t enx_ndi_event_hdl;

/*
 * SM's init type reply flags
 *
 * Each predicate takes an init-type-reply bitmask (itr) and evaluates
 * to non-zero when the named SM_INIT_TYPE_* reply bit is NOT set in it.
 */
#define ENX_PORT_ATTR_LOADED(itr)                               \
        (((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0)
#define ENX_PORT_ATTR_NOT_PRESERVED(itr)                        \
        (((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)
#define ENX_PORT_PRES_NOT_PRESERVED(itr)                        \
        (((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0)

/*
 * Port monitor progress flags (all flag values should be non-zero)
 *
 * eibnx_port_monitor() sets one of these bits in ti_progress after
 * each setup step succeeds, and on its way out tests the bits to
 * decide which of the steps need to be rolled back.  Each flag is a
 * distinct single bit; ENX_MON_MAX is the OR of all of them.
 */
#define ENX_MON_LINKSTATE_UP            0x01
#define ENX_MON_FOUND_MCGS              0x02
#define ENX_MON_SETUP_CQ                0x04
#define ENX_MON_SETUP_UD_CHAN           0x08
#define ENX_MON_SETUP_BUFS              0x10
#define ENX_MON_SETUP_CQ_HDLR           0x20
#define ENX_MON_JOINED_MCGS             0x40
#define ENX_MON_MULTICAST_SLCT          0x80
#define ENX_MON_MAX                     0xFF

/*
 * Per-port thread to solicit, monitor and discover EoIB gateways
 * and create the corresponding EoIB driver instances on the host.
 *
 * The thread proceeds in stages, recording each completed stage in
 * info->ti_progress:
 *
 *   1. wait for the port's link state to become IBT_PORT_ACTIVE
 *   2. locate the EoIB multicast groups (waiting for them if needed)
 *   3. set up the CQ, the UD channel, the tx/rx buffers and the
 *      completion handler
 *   4. join the multicast groups and send the multicast solicit
 *
 * After that it loops forever, sending a unicast solicit each time
 * the solicit period expires.  The thread leaves when ENX_EVENT_DIE
 * is seen in info->ti_event or when any setup step fails; on the way
 * out, the completed stages are undone in the reverse order.
 *
 * Every blocking wait is bracketed by CALLB_CPR_SAFE_BEGIN/END on a
 * private lock (ci_lock) registered with CALLB_CPR_INIT.
 */
void
eibnx_port_monitor(eibnx_thr_info_t *info)
{
        clock_t solicit_period_ticks;
        clock_t deadline;
        kmutex_t ci_lock;
        callb_cpr_t ci;
        char thr_name[MAXNAMELEN];

        (void) snprintf(thr_name, MAXNAMELEN, ENX_PORT_MONITOR,
            info->ti_pi->p_port_num);

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, thr_name);

        info->ti_progress = 0;

        /*
         * If the port is not active yet, wait for a port up event. The
         * async handler, when it sees a port-up event, is expected to
         * update the port_monitor's portinfo structure's p_linkstate
         * and wake us up with ENX_EVENT_LINK_UP.
         */
        while (info->ti_pi->p_linkstate != IBT_PORT_ACTIVE) {
                mutex_enter(&info->ti_event_lock);
                while ((info->ti_event &
                    (ENX_EVENT_LINK_UP | ENX_EVENT_DIE)) == 0) {
                        mutex_enter(&ci_lock);
                        CALLB_CPR_SAFE_BEGIN(&ci);
                        mutex_exit(&ci_lock);

                        cv_wait(&info->ti_event_cv, &info->ti_event_lock);

                        mutex_enter(&ci_lock);
                        CALLB_CPR_SAFE_END(&ci, &ci_lock);
                        mutex_exit(&ci_lock);
                }
                if (info->ti_event & ENX_EVENT_DIE) {
                        mutex_exit(&info->ti_event_lock);
                        goto port_monitor_exit;
                }
                /* Consume the event and re-check the link state */
                info->ti_event &= (~ENX_EVENT_LINK_UP);
                mutex_exit(&info->ti_event_lock);
        }
        info->ti_progress |= ENX_MON_LINKSTATE_UP;

        /*
         * Locate the multicast groups for sending solicit requests
         * to the GW and receiving advertisements from the GW. If
         * either of the mcg is not present, wait for them to be
         * created by the GW.
         */
        while (eibnx_find_mgroups(info) != ENX_E_SUCCESS) {
                mutex_enter(&info->ti_event_lock);
                while ((info->ti_event &
                    (ENX_EVENT_MCGS_AVAILABLE | ENX_EVENT_DIE)) == 0) {
                        mutex_enter(&ci_lock);
                        CALLB_CPR_SAFE_BEGIN(&ci);
                        mutex_exit(&ci_lock);

                        cv_wait(&info->ti_event_cv, &info->ti_event_lock);

                        mutex_enter(&ci_lock);
                        CALLB_CPR_SAFE_END(&ci, &ci_lock);
                        mutex_exit(&ci_lock);
                }
                if (info->ti_event & ENX_EVENT_DIE) {
                        mutex_exit(&info->ti_event_lock);
                        goto port_monitor_exit;
                }
                /* Consume the event and retry the mcg lookup */
                info->ti_event &= (~ENX_EVENT_MCGS_AVAILABLE);
                mutex_exit(&info->ti_event_lock);
        }
        info->ti_progress |= ENX_MON_FOUND_MCGS;

        /*
         * Setup a shared CQ
         */
        if (eibnx_setup_cq(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_setup_cq() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_SETUP_CQ;

        /*
         * Setup UD channel
         */
        if (eibnx_setup_ud_channel(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_setup_ud_channel() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_SETUP_UD_CHAN;

        /*
         * Allocate/initialize any tx/rx buffers
         */
        if (eibnx_setup_bufs(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_setup_bufs() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_SETUP_BUFS;

        /*
         * Setup completion handler
         */
        if (eibnx_setup_cq_handler(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_setup_cq_handler() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_SETUP_CQ_HDLR;

        /*
         * Join EoIB multicast groups.  Note that the two string
         * literals below must be adjacent (no comma between them) so
         * that they form a single format string.
         */
        if (eibnx_join_mcgs(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_join_mcgs() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_JOINED_MCGS;

        /*
         * Send SOLICIT pkt to the EoIB multicast group
         */
        if (eibnx_fip_solicit_mcast(info) != ENX_E_SUCCESS) {
                ENX_DPRINTF_ERR("eibnx_fip_solicit_mcast() failed, terminating "
                    "port monitor for (hca_guid=0x%llx, port_num=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                goto port_monitor_exit;
        }
        info->ti_progress |= ENX_MON_MULTICAST_SLCT;

        /*
         * From here on ti_event_lock stays held across iterations of
         * the periodic loop; it is dropped only inside cv_timedwait()
         * and just before we leave on ENX_EVENT_DIE.
         */
        mutex_enter(&info->ti_event_lock);

        solicit_period_ticks = drv_usectohz(ENX_DFL_SOLICIT_PERIOD_USEC);

periodic_solicit:
        deadline = ddi_get_lbolt() + solicit_period_ticks;
        while ((info->ti_event & (ENX_EVENT_TIMED_OUT | ENX_EVENT_DIE)) == 0) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                /* A return of -1 means the deadline passed */
                if (cv_timedwait(&info->ti_event_cv, &info->ti_event_lock,
                    deadline) == -1) {
                        info->ti_event |= ENX_EVENT_TIMED_OUT;
                }

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        if (info->ti_event & ENX_EVENT_DIE) {
                mutex_exit(&info->ti_event_lock);
                goto port_monitor_exit;
        }

        /*
         * The solicit period is passed by reference; presumably the
         * callee may adjust it for the next round (confirm against
         * eibnx_fip_solicit_ucast()).  A failure here is only warned
         * about; we keep soliciting.
         */
        if (info->ti_event & ENX_EVENT_TIMED_OUT) {
                if (eibnx_fip_solicit_ucast(info,
                    &solicit_period_ticks) != ENX_E_SUCCESS) {
                        ENX_DPRINTF_WARN("failed to send solicit ucast to "
                            "gateways (hca_guid=0x%llx, port_num=0x%x)",
                            info->ti_hca_guid, info->ti_pi->p_port_num);
                }
                info->ti_event &= ~ENX_EVENT_TIMED_OUT;
        }

        goto periodic_solicit;

port_monitor_exit:
        /*
         * Undo whatever stages were completed, latest first.
         */
        if (info->ti_progress & ENX_MON_MULTICAST_SLCT) {
                eibnx_cleanup_port_nodes(info);
                info->ti_progress &= (~ENX_MON_MULTICAST_SLCT);
        }
        if (info->ti_progress & ENX_MON_JOINED_MCGS) {
                eibnx_rb_join_mcgs(info);
                info->ti_progress &= (~ENX_MON_JOINED_MCGS);
        }
        if (info->ti_progress & ENX_MON_SETUP_CQ_HDLR) {
                eibnx_rb_setup_cq_handler(info);
                info->ti_progress &= (~ENX_MON_SETUP_CQ_HDLR);
        }
        if (info->ti_progress & ENX_MON_SETUP_BUFS) {
                eibnx_rb_setup_bufs(info);
                info->ti_progress &= (~ENX_MON_SETUP_BUFS);
        }
        if (info->ti_progress & ENX_MON_SETUP_UD_CHAN) {
                eibnx_rb_setup_ud_channel(info);
                info->ti_progress &= (~ENX_MON_SETUP_UD_CHAN);
        }
        if (info->ti_progress & ENX_MON_SETUP_CQ) {
                eibnx_rb_setup_cq(info);
                info->ti_progress &= (~ENX_MON_SETUP_CQ);
        }
        if (info->ti_progress & ENX_MON_FOUND_MCGS) {
                eibnx_rb_find_mgroups(info);
                info->ti_progress &= (~ENX_MON_FOUND_MCGS);
        }

        /*
         * CALLB_CPR_EXIT() is entered with ci_lock held; no matching
         * mutex_exit() follows it before the lock is destroyed.
         */
        mutex_enter(&ci_lock);
        CALLB_CPR_EXIT(&ci);
        mutex_destroy(&ci_lock);
}

/*
 * Async subnet notices handler registered with IBTF.
 *
 * The only notice acted upon is the creation of a multicast group
 * whose gid matches either the EoIB solicit mgid or the EoIB
 * advertise mgid.  In that case every port monitor on the nexus's
 * thread list is flagged with ENX_EVENT_MCGS_AVAILABLE and woken up.
 * All other notices are ignored.
 */
/*ARGSUSED*/
void
eibnx_subnet_notices_handler(void *arg, ib_gid_t gid,
    ibt_subnet_event_code_t sn_evcode, ibt_subnet_event_t *sn_event)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_thr_info_t *monitor;
        ib_gid_t mgid;

        if (sn_evcode != IBT_SM_EVENT_MCG_CREATED)
                return;

        /*
         * Not interested unless it is one of our two groups
         */
        mgid = sn_event->sm_notice_gid;
        if ((mgid.gid_prefix != enx_solicit_mgid.gid_prefix ||
            mgid.gid_guid != enx_solicit_mgid.gid_guid) &&
            (mgid.gid_prefix != enx_advertise_mgid.gid_prefix ||
            mgid.gid_guid != enx_advertise_mgid.gid_guid)) {
                return;
        }

        /*
         * Let every port monitor know
         */
        mutex_enter(&ss->nx_lock);
        monitor = ss->nx_thr_info;
        while (monitor != NULL) {
                mutex_enter(&monitor->ti_event_lock);
                monitor->ti_event |= ENX_EVENT_MCGS_AVAILABLE;
                cv_broadcast(&monitor->ti_event_cv);
                mutex_exit(&monitor->ti_event_lock);

                monitor = monitor->ti_next;
        }
        mutex_exit(&ss->nx_lock);
}

/*
 * Async event handler registered with IBTF.
 *
 * Channel, CQ and SRQ errors and port-down events are only logged.
 * Port-up, port-change and client-reregister events are logged and
 * then passed to eibnx_handle_port_events().  HCA attach and detach
 * events are forwarded to their respective handlers.  Any other
 * event code is merely noted.
 */
/*ARGSUSED*/
void
eibnx_async_handler(void *clnt_pvt, ibt_hca_hdl_t hca,
    ibt_async_code_t code, ibt_async_event_t *event)
{
        boolean_t is_port_event = B_FALSE;

        switch (code) {
        case IBT_ERROR_CATASTROPHIC_CHAN:
        case IBT_ERROR_INVALID_REQUEST_CHAN:
        case IBT_ERROR_ACCESS_VIOLATION_CHAN:
        case IBT_ERROR_CQ:
        case IBT_ERROR_CATASTROPHIC_SRQ:
                ENX_DPRINTF_ERR("ibt ERROR event 0x%x received "
                    "(hca_guid=0x%llx)", code, event->ev_hca_guid);
                break;

        case IBT_ERROR_PORT_DOWN:
                ENX_DPRINTF_WARN("ibt PORT_DOWN event received "
                    "(hca_guid=0x%llx, port_num=0x%x)",
                    event->ev_hca_guid, event->ev_port);
                break;

        case IBT_EVENT_PORT_UP:
                ENX_DPRINTF_WARN("ibt PORT_UP event received "
                    "(hca_guid=0x%llx, port_num=0x%x)",
                    event->ev_hca_guid, event->ev_port);
                is_port_event = B_TRUE;
                break;

        case IBT_PORT_CHANGE_EVENT:
                ENX_DPRINTF_WARN("ibt PORT_CHANGE event received "
                    "(hca_guid=0x%llx, port_num=0x%x)",
                    event->ev_hca_guid, event->ev_port);
                is_port_event = B_TRUE;
                break;

        case IBT_CLNT_REREG_EVENT:
                ENX_DPRINTF_WARN("ibt CLNT_REREG event received "
                    "(hca_guid=0x%llx, port_num=0x%x)",
                    event->ev_hca_guid, event->ev_port);
                is_port_event = B_TRUE;
                break;

        case IBT_HCA_ATTACH_EVENT:
                ENX_DPRINTF_VERBOSE("ibt HCA_ATTACH event received "
                    "(new hca_guid=0x%llx)", event->ev_hca_guid);
                eibnx_handle_hca_attach(event->ev_hca_guid);
                break;

        case IBT_HCA_DETACH_EVENT:
                ENX_DPRINTF_VERBOSE("ibt HCA_DETACH event received "
                    "(target hca_guid=0x%llx)", event->ev_hca_guid);
                eibnx_handle_hca_detach(event->ev_hca_guid);
                break;

        default:
                ENX_DPRINTF_VERBOSE("ibt UNSUPPORTED event 0x%x received "
                    "(hca_guid=0x%llx)", code, event->ev_hca_guid);
                break;
        }

        /*
         * All three port state events share the same handling
         */
        if (is_port_event)
                eibnx_handle_port_events(hca, event->ev_port);
}

/*
 * Check whether more than gw_adv_timeout_ticks have gone by since the
 * last advertisement timestamp recorded for this gateway.  If so, the
 * gateway's advertisement flag is set to ENX_GW_DEAD and B_TRUE is
 * returned; otherwise nothing is changed and B_FALSE is returned.
 */
boolean_t
eibnx_is_gw_dead(eibnx_gw_info_t *gwi)
{
        int64_t now = ddi_get_lbolt64();
        boolean_t dead = B_FALSE;

        mutex_enter(&gwi->gw_adv_lock);
        if ((now - gwi->gw_adv_last_lbolt) > gwi->gw_adv_timeout_ticks) {
                gwi->gw_adv_flag = ENX_GW_DEAD;
                dead = B_TRUE;
        }
        mutex_exit(&gwi->gw_adv_lock);

        return (dead);
}

/*
 * Called when an advertisement has been received from a gateway we
 * already know about.  Whether it was multicast or unicast, it shows
 * that the gateway is alive, so the current time (in ticks) is
 * recorded as its last advertisement time.  A gateway that had been
 * marked ENX_GW_DEAD is moved back to ENX_GW_ALIVE; any other flag
 * value is left as it is.
 */
static void
eibnx_gw_is_alive(eibnx_gw_info_t *gwi)
{
        mutex_enter(&gwi->gw_adv_lock);

        gwi->gw_adv_last_lbolt = ddi_get_lbolt64();
        if (gwi->gw_adv_flag == ENX_GW_DEAD)
                gwi->gw_adv_flag = ENX_GW_ALIVE;

        mutex_exit(&gwi->gw_adv_lock);
}

/*
 * Called on receipt of a unicast advertisement from a gateway that is
 * already in our list.  The advertisement time is refreshed and the
 * gateway is marked ENX_GW_AWARE.  If it was not in the aware state
 * before, it has come back and become aware of us, so a "rebirth"
 * event is posted for the EoIB child bound to it.  If the caller
 * says the gateway's properties changed (gwi_changed), a "gw info
 * update" event carrying a snapshot of the new properties is posted
 * ahead of the rebirth event, so that the rebirth is processed with
 * the correct gateway information.
 */
static void
eibnx_gw_is_aware(eibnx_thr_info_t *info, eibnx_gw_info_t *gwi,
    boolean_t gwi_changed)
{
        eib_gw_info_t snapshot;
        boolean_t was_aware;

        mutex_enter(&gwi->gw_adv_lock);
        gwi->gw_adv_last_lbolt = ddi_get_lbolt64();
        was_aware = (gwi->gw_adv_flag == ENX_GW_AWARE) ? B_TRUE : B_FALSE;
        gwi->gw_adv_flag = ENX_GW_AWARE;
        mutex_exit(&gwi->gw_adv_lock);

        /*
         * The info update, if any, always goes out first
         */
        if (gwi_changed) {
                bcopy(gwi->gw_system_name, snapshot.gi_system_name,
                    EIB_GW_SYSNAME_LEN);
                bcopy(gwi->gw_port_name, snapshot.gi_port_name,
                    EIB_GW_PORTNAME_LEN);
                bcopy(gwi->gw_vendor_id, snapshot.gi_vendor_id,
                    EIB_GW_VENDOR_LEN);

                snapshot.gi_system_guid = gwi->gw_system_guid;
                snapshot.gi_guid = gwi->gw_guid;
                snapshot.gi_sn_prefix = gwi->gw_addr.ga_gid.gid_prefix;
                snapshot.gi_adv_period = gwi->gw_adv_period;
                snapshot.gi_ka_period = gwi->gw_ka_period;
                snapshot.gi_vnic_ka_period = gwi->gw_vnic_ka_period;
                snapshot.gi_ctrl_qpn = gwi->gw_ctrl_qpn;
                snapshot.gi_lid = gwi->gw_lid;
                snapshot.gi_portid = gwi->gw_portid;
                snapshot.gi_num_net_vnics = gwi->gw_num_net_vnics;
                snapshot.gi_flag_available = gwi->gw_flag_available;
                snapshot.gi_is_host_adm_vnics = gwi->gw_is_host_adm_vnics;
                snapshot.gi_sl = gwi->gw_sl;
                snapshot.gi_n_rss_qpn = gwi->gw_n_rss_qpn;

                eibnx_handle_gw_info_update(info, snapshot.gi_portid,
                    &snapshot);
        }

        if (!was_aware)
                eibnx_handle_gw_rebirth(info, gwi->gw_portid);
}

/*
 * Thread to create eoib nodes and online instances
 */
void
eibnx_create_eoib_node(void)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_nodeq_t *node;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, ENX_NODE_CREATOR);

wait_for_node_to_create:
        mutex_enter(&ss->nx_nodeq_lock);

        while ((ss->nx_nodeq == NULL) && (ss->nx_nodeq_thr_die == 0)) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&ss->nx_nodeq_cv, &ss->nx_nodeq_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * If this is not really a work item, but a request for us to
         * die, throwaway all pending work requests and just die.
         */
        if (ss->nx_nodeq_thr_die) {
                while (ss->nx_nodeq) {
                        node = ss->nx_nodeq;
                        ss->nx_nodeq = node->nc_next;
                        node->nc_next = NULL;

                        kmem_free(node, sizeof (eibnx_nodeq_t));
                }
                mutex_exit(&ss->nx_nodeq_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        /*
         * Grab the first node entry from the queue
         */
        ASSERT(ss->nx_nodeq != NULL);
        node = ss->nx_nodeq;
        ss->nx_nodeq = node->nc_next;
        node->nc_next = NULL;

        mutex_exit(&ss->nx_nodeq_lock);

        (void) eibnx_configure_node(node->nc_info, node->nc_gwi, NULL);

        kmem_free(node, sizeof (eibnx_nodeq_t));
        goto wait_for_node_to_create;

        /*NOTREACHED*/
}

/*
 * Tx and Rx completion interrupt handler. Guaranteed to be single
 * threaded and nonreentrant for this CQ.
 *
 * No completion processing is done here: if the CQ handle matches
 * the one recorded in the port monitor's info, the soft interrupt
 * is triggered; a mismatched handle is logged and ignored.
 */
void
eibnx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
        eibnx_thr_info_t *info = arg;

        if (info->ti_cq_hdl == cq_hdl) {
                ASSERT(info->ti_softint_hdl != NULL);

                (void) ddi_intr_trigger_softint(info->ti_softint_hdl, NULL);
                return;
        }

        ENX_DPRINTF_DEBUG("eibnx_comp_intr: "
            "cq_hdl(0x%llx) != info->ti_cq_hdl(0x%llx), "
            "ignoring completion", cq_hdl, info->ti_cq_hdl);
}

/*
 * Send and Receive completion handler for the EoIB nexus.
 *
 * arg1 is the port monitor's eibnx_thr_info_t.  The handler marks
 * itself active with ENX_EVENT_COMPLETION, re-arms CQ notification,
 * and drains the CQ: failed completions are reported, successful
 * receives are processed and their wqes returned, and successful
 * sends simply have their wqes returned.  On the way out the
 * ENX_EVENT_COMPLETION mark is removed and waiters on the event cv
 * are woken.
 *
 * NOTE(review): the early-out path returns ENX_E_SUCCESS while the
 * normal path returns DDI_INTR_CLAIMED - confirm this is intended.
 */

/*ARGSUSED*/
uint_t
eibnx_comp_handler(caddr_t arg1, caddr_t arg2)
{
        eibnx_thr_info_t *info = (eibnx_thr_info_t *)arg1;
        ibt_wc_t *wc;
        eibnx_wqe_t *wqe;
        ibt_status_t ret;
        uint_t nwc;
        uint_t ndx;

        /*
         * Don't start if the port monitor is being killed or if
         * completion processing is already marked as in progress.
         * Otherwise, mark ourselves as in progress so that the port
         * monitor isn't killed while we're in here.
         */
        mutex_enter(&info->ti_event_lock);
        if ((info->ti_event & (ENX_EVENT_DIE | ENX_EVENT_COMPLETION)) != 0) {
                mutex_exit(&info->ti_event_lock);
                return ((uint_t)ENX_E_SUCCESS);
        }
        info->ti_event |= ENX_EVENT_COMPLETION;
        mutex_exit(&info->ti_event_lock);

        /*
         * The notification callback is re-armed before the CQ is
         * polled.  If that fails, there's not much to be done other
         * than warn about it and carry on.
         */
        ret = ibt_enable_cq_notify(info->ti_cq_hdl, IBT_NEXT_COMPLETION);
        if (ret != IBT_SUCCESS) {
                ENX_DPRINTF_WARN("ibt_enable_cq_notify(cq_hdl=0x%llx) "
                    "failed, ret=%d", info->ti_cq_hdl, ret);
        }

        /*
         * Drain the CQ, a batch of up to ti_cq_sz completions at a
         * time, until the poll stops returning IBT_SUCCESS.
         */
        for (;;) {
                ret = ibt_poll_cq(info->ti_cq_hdl, info->ti_wc,
                    info->ti_cq_sz, &nwc);
                if (ret != IBT_SUCCESS)
                        break;

                for (ndx = 0; ndx < nwc; ndx++) {
                        wc = &info->ti_wc[ndx];
                        wqe = (eibnx_wqe_t *)(uintptr_t)wc->wc_id;

                        if (wc->wc_status != IBT_WC_SUCCESS) {
                                eibnx_handle_wcerr(wc->wc_status, wqe, info);
                                continue;
                        }

                        if (wqe->qe_type == ENX_QETYP_RWQE) {
                                eibnx_process_rx(info, wc, wqe);
                                eibnx_return_rwqe(info, wqe);
                        } else {
                                eibnx_return_swqe(wqe);
                        }
                }
        }

        /*
         * We're done; wake up anyone waiting to kill the port monitor
         * thread.  This has to be a cv_broadcast() since several
         * threads may be sleeping on the event cv and each of them
         * must get a chance to see if it's their turn.
         */
        mutex_enter(&info->ti_event_lock);
        info->ti_event &= (~ENX_EVENT_COMPLETION);
        cv_broadcast(&info->ti_event_cv);
        mutex_exit(&info->ti_event_lock);

        return (DDI_INTR_CLAIMED);
}

/*
 * Rx processing code.
 *
 * Parses the FIP message that follows the GRH in the received
 * buffer.  Login acks are passed on to the EoIB child they are meant
 * for.  Gateway advertisements drive the gateway state machine:
 *
 *   - a gateway advertising itself as not available is recorded (or
 *     updated) in the UNAVAILABLE state
 *   - a multicast advertisement from an available gateway records a
 *     new gateway as AVAILABLE, moves a known UNAVAILABLE gateway to
 *     AVAILABLE, and otherwise keeps the known gateway's state
 *   - a unicast advertisement from an available gateway makes it
 *     READY_TO_LOGIN, and if it wasn't already in that state, the
 *     gateway is queued for creation of its EoIB instance
 *
 * Anything else is dropped.
 */
static void
eibnx_process_rx(eibnx_thr_info_t *info, ibt_wc_t *wc, eibnx_wqe_t *wqe)
{
        eibnx_gw_msg_t msg;
        eibnx_gw_info_t *adv;
        eibnx_gw_info_t *known;
        eibnx_gw_info_t *added;
        uint_t prev_state;
        uint8_t *pkt = (uint8_t *)(uintptr_t)(wqe->qe_sgl.ds_va);
        boolean_t gwi_changed;

        /*
         * Nothing (not even broadcast advertisements from gws) is
         * looked at until our solicitation broadcast has been done.
         */
        if (info->ti_mcast_done == 0)
                return;

        /*
         * The FIP message starts right after the GRH
         */
        if (eibnx_fip_parse_pkt(pkt + ENX_GRH_SZ, &msg) != ENX_E_SUCCESS)
                return;

        /*
         * Login acks belong to one of our children
         */
        if (msg.gm_type == FIP_VNIC_LOGIN_ACK) {
                eibnx_handle_login_ack(info, pkt);
                return;
        }

        /*
         * Gateway advertisements are the only other thing we handle
         */
        if (msg.gm_type != FIP_GW_ADVERTISE_MCAST &&
            msg.gm_type != FIP_GW_ADVERTISE_UCAST) {
                return;
        }

        adv = &msg.u.gm_info;
        known = eibnx_find_gw_in_gwlist(info, adv);

        /*
         * Advertisement from a gateway we haven't seen before
         */
        if (known == NULL) {
                if (adv->gw_flag_available != 0 &&
                    adv->gw_flag_ucast_advt != 0) {
                        /*
                         * Unicast response to our solicitation: we're
                         * ready to login, so get the instance created.
                         */
                        adv->gw_state = ENX_GW_STATE_READY_TO_LOGIN;
                        adv->gw_adv_flag = ENX_GW_AWARE;
                        added = eibnx_add_gw_to_gwlist(info, adv, wc, pkt);
                        if (added != NULL)
                                eibnx_queue_for_creation(info, added);
                } else {
                        if (adv->gw_flag_available == 0)
                                adv->gw_state = ENX_GW_STATE_UNAVAILABLE;
                        else
                                adv->gw_state = ENX_GW_STATE_AVAILABLE;
                        adv->gw_adv_flag = ENX_GW_ALIVE;
                        (void) eibnx_add_gw_to_gwlist(info, adv, wc, pkt);
                }
                return;
        }

        /*
         * Advertisement from a gateway already in our list
         */
        prev_state = known->gw_state;

        if (adv->gw_flag_available == 0) {
                adv->gw_state = ENX_GW_STATE_UNAVAILABLE;
                eibnx_replace_gw_in_gwlist(info, known, adv, wc, pkt, NULL);
                eibnx_gw_is_alive(known);
                return;
        }

        if (adv->gw_flag_ucast_advt == 0) {
                if (prev_state == ENX_GW_STATE_UNAVAILABLE)
                        adv->gw_state = ENX_GW_STATE_AVAILABLE;
                else
                        adv->gw_state = prev_state;
                eibnx_replace_gw_in_gwlist(info, known, adv, wc, pkt, NULL);
                eibnx_gw_is_alive(known);
                return;
        }

        adv->gw_state = ENX_GW_STATE_READY_TO_LOGIN;
        eibnx_replace_gw_in_gwlist(info, known, adv, wc, pkt, &gwi_changed);
        eibnx_gw_is_aware(info, known, gwi_changed);

        if (prev_state != ENX_GW_STATE_READY_TO_LOGIN)
                eibnx_queue_for_creation(info, known);
}

/*
 * Report a failed work completion.  Only flushed work requests,
 * local channel operation errors and local protection errors are
 * reported; any other status is silently ignored.  No recovery is
 * attempted.
 */
/*ARGSUSED*/
static void
eibnx_handle_wcerr(uint8_t wcerr, eibnx_wqe_t *wqe, eibnx_thr_info_t *info)
{
        if (wcerr == IBT_WC_WR_FLUSHED_ERR) {
                ENX_DPRINTF_VERBOSE("IBT_WC_WR_FLUSHED_ERR seen "
                    "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type);
        } else if (wcerr == IBT_WC_LOCAL_CHAN_OP_ERR) {
                ENX_DPRINTF_ERR("IBT_WC_LOCAL_CHAN_OP_ERR seen "
                    "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type);
        } else if (wcerr == IBT_WC_LOCAL_PROTECT_ERR) {
                ENX_DPRINTF_ERR("IBT_WC_LOCAL_PROTECT_ERR seen "
                    "(hca_guid=0x%llx, port_num=0x%x, wqe_type=0x%x)",
                    info->ti_hca_guid, info->ti_pi->p_port_num, wqe->qe_type);
        }
}

/*
 * Pass a login acknowledgement on to the EoIB child it is meant for.
 *
 * The child's instance is derived from the vnic id carried in the
 * ack.  If a child with that instance exists and a login-ack event
 * cookie can be retrieved for it, the whole packet (GRH included) is
 * posted to the child as the event data.  No validation of the ack
 * is done here; that's left to the child's callback.
 */
static void
eibnx_handle_login_ack(eibnx_thr_info_t *info, uint8_t *pkt)
{
        eibnx_t *ss = enx_global_ss;
        fip_login_ack_t *ack;
        ddi_eventcookie_t cookie;
        dev_info_t *child;
        uint16_t vnic_id;
        uint16_t inst;
        int ret;

        /*
         * The ack follows the GRH; the vnic id is in network order
         */
        ack = (fip_login_ack_t *)(pkt + ENX_GRH_SZ);
        vnic_id = ntohs(ack->ak_vnic_login.vl_vnic_id);
        inst = EIB_DEVI_INSTANCE(vnic_id);

        child = eibnx_find_child_dip_by_inst(info, inst);
        if (child == NULL) {
                ENX_DPRINTF_DEBUG("no eoib child with instance 0x%x found "
                    "for (hca_guid=0x%llx, port_num=0x%x)", inst,
                    info->ti_hca_guid, info->ti_pi->p_port_num);
                return;
        }

        ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, child,
            EIB_NDI_EVENT_LOGIN_ACK, &cookie, NDI_EVENT_NOPASS);
        if (ret != NDI_SUCCESS) {
                ENX_DPRINTF_WARN("no login-ack cookie for (hca_guid=0x%llx, "
                    "port_num=0x%x, eoib_inst=0x%x), ret=%d", info->ti_hca_guid,
                    info->ti_pi->p_port_num, inst, ret);
                return;
        }

        (void) ndi_post_event(ss->nx_dip, child, cookie, (void *)pkt);
}

/*
 * Post a "gateway available" event, with no event data, to the EoIB
 * child bound to the gateway with the given portid.  A warning is
 * logged and nothing is posted if there is no such child or if the
 * event cookie cannot be retrieved for it.
 */
static void
eibnx_handle_gw_rebirth(eibnx_thr_info_t *info, uint16_t portid)
{
        eibnx_t *ss = enx_global_ss;
        ddi_eventcookie_t cookie;
        dev_info_t *child;
        int ret;

        child = eibnx_find_child_dip_by_gw(info, portid);
        if (child == NULL) {
                ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x "
                    "found for (hca_guid=0x%llx, port_num=0x%x)",
                    portid, info->ti_hca_guid, info->ti_pi->p_port_num);
                return;
        }

        ret = ndi_event_retrieve_cookie(enx_ndi_event_hdl, child,
            EIB_NDI_EVENT_GW_AVAILABLE, &cookie, NDI_EVENT_NOPASS);
        if (ret != NDI_SUCCESS) {
                ENX_DPRINTF_WARN("no gw-available cookie for (hca_guid=0x%llx, "
                    "port_num=0x%x, gw_portid=0x%x), ret=%d", info->ti_hca_guid,
                    info->ti_pi->p_port_num, portid, ret);
                return;
        }

        (void) ndi_post_event(ss->nx_dip, child, cookie, NULL);
}

/*
 * Post the EIB_NDI_EVENT_GW_INFO_UPDATE NDI event to the eoib child that
 * is bound to the gateway 'portid' on the port monitored by 'info'.
 *
 * 'new_gw_info' is handed to ndi_post_event() unchanged as the event
 * payload; this routine does not inspect it.  If no matching child
 * exists, or no event cookie can be retrieved for it, a warning is
 * logged and nothing is posted.
 */
static void
eibnx_handle_gw_info_update(eibnx_thr_info_t *info, uint16_t portid,
    void *new_gw_info)
{
        eibnx_t *ss = enx_global_ss;
        ddi_eventcookie_t evc;
        dev_info_t *child;
        int rv;

        child = eibnx_find_child_dip_by_gw(info, portid);
        if (child == NULL) {
                ENX_DPRINTF_WARN("no eoib child bound to gw portid 0x%x "
                    "found for (hca_guid=0x%llx, port_num=0x%x)",
                    portid, info->ti_hca_guid, info->ti_pi->p_port_num);
                return;
        }

        rv = ndi_event_retrieve_cookie(enx_ndi_event_hdl, child,
            EIB_NDI_EVENT_GW_INFO_UPDATE, &evc, NDI_EVENT_NOPASS);
        if (rv == NDI_SUCCESS) {
                /* The result of the post itself is deliberately ignored */
                (void) ndi_post_event(ss->nx_dip, child, evc, new_gw_info);
                return;
        }

        ENX_DPRINTF_WARN("no gw-info-update cookie for "
            "(hca_guid=0x%llx, port_num=0x%x, gw_portid=0x%x), "
            "ret=%d", info->ti_hca_guid, info->ti_pi->p_port_num,
            portid, rv);
}

/*
 * Replace the saved port attributes of the port monitored by 'ti' with
 * 'new_pi' (whose size, as used with ibt_free_portinfo(), is
 * 'new_size_pi').
 *
 * With ss->nx_lock held, the global HCA list is searched for the HCA
 * whose handle equals ti->ti_hca, and that HCA's port list is searched
 * for the port whose po_pi equals ti->ti_pi.  On a match the old
 * portinfo is released with ibt_free_portinfo() and both the port entry
 * and ti->ti_pi are pointed at the new one.
 *
 * Returns:
 *      ENX_E_SUCCESS   portinfo replaced; 'new_pi' is now referenced by
 *                      the port list and by 'ti', so the caller must not
 *                      free it.
 *      ENX_E_FAILURE   HCA or port not found; nothing was modified and
 *                      'new_pi' remains the caller's to free.
 */
static int
eibnx_replace_portinfo(eibnx_thr_info_t *ti, ibt_hca_portinfo_t *new_pi,
    uint_t new_size_pi)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_hca_t *hca;
        eibnx_port_t *port;

        mutex_enter(&ss->nx_lock);

        for (hca = ss->nx_hca; hca; hca = hca->hc_next) {
                if (hca->hc_hdl == ti->ti_hca)
                        break;
        }

        if (hca == NULL) {
                /*
                 * The handle is not a long long; convert it explicitly
                 * so that the variadic argument matches the %llx
                 * conversion on every data model.
                 */
                ENX_DPRINTF_WARN("hca hdl (0x%llx) not found in hca list",
                    (unsigned long long)(uintptr_t)ti->ti_hca);
                mutex_exit(&ss->nx_lock);
                return (ENX_E_FAILURE);
        }

        for (port = hca->hc_port; port; port = port->po_next) {
                if (port->po_pi == ti->ti_pi) {
                        /*
                         * Release the stale attributes first, then make
                         * the port entry and the monitor share the new
                         * portinfo.
                         */
                        ibt_free_portinfo(port->po_pi, port->po_pi_size);
                        port->po_pi = new_pi;
                        port->po_pi_size = new_size_pi;
                        ti->ti_pi = port->po_pi;
                        break;
                }
        }

        if (port == NULL) {
                /* Same explicit conversion as above, for the pointer */
                ENX_DPRINTF_WARN("portinfo (0x%llx) not found in hca list",
                    (unsigned long long)(uintptr_t)ti->ti_pi);
                mutex_exit(&ss->nx_lock);
                return (ENX_E_FAILURE);
        }

        mutex_exit(&ss->nx_lock);

        return (ENX_E_SUCCESS);
}

/*
 * Handle a port event reported for port 'ev_portnum' of HCA 'ev_hca'.
 *
 * The port monitor whose HCA handle and port number match the event is
 * looked up under ss->nx_lock; if there is none the event is ignored.
 * Fresh port attributes are then queried, and the event is dropped
 * unless exactly one active port is reported.  After that, in order:
 *
 *   - if the init-type-reply says attributes were loaded and not
 *     preserved, and the base lid differs from the saved one, the saved
 *     portinfo is replaced with the new one (which is then owned by our
 *     lists rather than freed here);
 *   - if the monitor has not yet recorded ENX_MON_LINKSTATE_UP, its
 *     saved linkstate is marked active and ENX_EVENT_LINK_UP is
 *     broadcast on its event cv;
 *   - if the init-type-reply says presence was not preserved and the
 *     monitor has already joined its mcgs, they are rejoined.
 *
 * NOTE(review): the "num_pi != 1" branch reads pi->p_linkstate for its
 * log message; this presumes the query always returns a non-NULL
 * portinfo on success -- confirm against ibt_query_hca_ports().
 */
static void
eibnx_handle_port_events(ibt_hca_hdl_t ev_hca, uint8_t ev_portnum)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_thr_info_t *mon;
        ibt_hca_portinfo_t *new_pi;
        ibt_status_t status;
        uint_t n_pi;
        uint_t pi_sz;
        uint8_t init_reply;

        /*
         * Locate the monitor responsible for this (hca, port) pair
         */
        mutex_enter(&ss->nx_lock);
        mon = ss->nx_thr_info;
        while (mon != NULL) {
                if (mon->ti_hca == ev_hca &&
                    mon->ti_pi->p_port_num == ev_portnum)
                        break;
                mon = mon->ti_next;
        }
        mutex_exit(&ss->nx_lock);

        if (mon == NULL)
                return;

        /*
         * Fetch the current attributes of the port; nothing more to do
         * unless we get back exactly one port and it is active.
         */
        status = ibt_query_hca_ports(ev_hca, ev_portnum, &new_pi, &n_pi,
            &pi_sz);
        if (status != IBT_SUCCESS) {
                ENX_DPRINTF_WARN("ibt_query_hca_ports() failed with %d",
                    status);
                return;
        }
        if (n_pi != 1 || new_pi->p_linkstate != IBT_PORT_ACTIVE) {
                ENX_DPRINTF_WARN("ibt_query_hca_ports(port_num=%d) failed, "
                    "num_pi=%d, linkstate=0x%x", ev_portnum, n_pi,
                    new_pi->p_linkstate);
                ibt_free_portinfo(new_pi, pi_sz);
                return;
        }

        /*
         * A changed base lid means the portinfo saved in our lists is
         * stale and has to be swapped for the new one before going any
         * further.  On a successful swap the new portinfo belongs to
         * the lists, so forget it here to avoid freeing it below.
         */
        init_reply = new_pi->p_init_type_reply;
        if (ENX_PORT_ATTR_LOADED(init_reply) &&
            ENX_PORT_ATTR_NOT_PRESERVED(init_reply) &&
            mon->ti_pi->p_base_lid != new_pi->p_base_lid &&
            eibnx_replace_portinfo(mon, new_pi, pi_sz) == ENX_E_SUCCESS) {
                new_pi = NULL;
                pi_sz = 0;
        }

        /*
         * Wake up the monitor if it is still waiting for the link to
         * come up.
         */
        mutex_enter(&mon->ti_event_lock);
        if ((mon->ti_progress & ENX_MON_LINKSTATE_UP) !=
            ENX_MON_LINKSTATE_UP) {
                mon->ti_pi->p_linkstate = IBT_PORT_ACTIVE;
                mon->ti_event |= ENX_EVENT_LINK_UP;
                cv_broadcast(&mon->ti_event_cv);
        }
        mutex_exit(&mon->ti_event_lock);

        /*
         * Rejoin the mcgs if the monitor had joined them already
         */
        if (ENX_PORT_PRES_NOT_PRESERVED(init_reply) &&
            (mon->ti_progress & ENX_MON_JOINED_MCGS)) {
                (void) eibnx_rejoin_mcgs(mon);
        }

        /*
         * Still ours to release if it was not handed over to the lists
         */
        if (new_pi != NULL)
                ibt_free_portinfo(new_pi, pi_sz);
}

/*
 * Handle the attach of the HCA identified by 'new_hca_guid'.
 *
 * If any existing port monitor already carries this HCA guid, a verbose
 * message is logged and nothing else is done.  Otherwise the HCA is
 * prepared with eibnx_prepare_hca(); if that yields an HCA, it is
 * pushed onto the front of the global HCA list and a port monitor is
 * started for each of its ports, each new monitor being pushed onto the
 * front of the global monitor list.  All list accesses are made with
 * ss->nx_lock held.
 */
static void
eibnx_handle_hca_attach(ib_guid_t new_hca_guid)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_thr_info_t *mon;
        eibnx_hca_t *new_hca;
        eibnx_port_t *p;

        /*
         * Do we already have a monitor running for this HCA ?
         */
        mutex_enter(&ss->nx_lock);
        mon = ss->nx_thr_info;
        while (mon != NULL) {
                if (mon->ti_hca_guid == new_hca_guid) {
                        ENX_DPRINTF_VERBOSE("hca (guid=0x%llx) already "
                            "attached", new_hca_guid);
                        mutex_exit(&ss->nx_lock);
                        return;
                }
                mon = mon->ti_next;
        }
        mutex_exit(&ss->nx_lock);

        /*
         * Not known to us yet; set the HCA up (done without holding
         * the lock) and give up if that does not produce one.
         */
        new_hca = eibnx_prepare_hca(new_hca_guid);
        if (new_hca == NULL)
                return;

        /*
         * Record the HCA and start one monitor per port
         */
        mutex_enter(&ss->nx_lock);

        new_hca->hc_next = ss->nx_hca;
        ss->nx_hca = new_hca;

        for (p = new_hca->hc_port; p != NULL; p = p->po_next) {
                mon = eibnx_start_port_monitor(new_hca, p);

                mon->ti_next = ss->nx_thr_info;
                ss->nx_thr_info = mon;
        }

        mutex_exit(&ss->nx_lock);
}

/*
 * Handle the detach of the HCA identified by 'del_hca_guid'.
 *
 * Done in three steps:
 *   1. With ss->nx_lock held, every port monitor carrying this HCA guid
 *      is unlinked from the global monitor list and pushed onto a
 *      private to-stop list.
 *   2. With the lock dropped, eibnx_stop_port_monitor() is called on
 *      each monitor collected in step 1.
 *   3. With the lock held again, the first HCA entry with this guid is
 *      unlinked from the global HCA list; once the lock is dropped, the
 *      entry (if one was found) is passed to eibnx_cleanup_hca().
 */
static void
eibnx_handle_hca_detach(ib_guid_t del_hca_guid)
{
        eibnx_t *ss = enx_global_ss;
        eibnx_thr_info_t *doomed = NULL;
        eibnx_thr_info_t **mon_link;
        eibnx_thr_info_t *mon;
        eibnx_thr_info_t *following;
        eibnx_hca_t **hca_link;
        eibnx_hca_t *hca;

        /*
         * Collect the monitors of this HCA.  'mon_link' always points
         * at the link that leads to the entry being examined, so taking
         * an entry out is a single store through it.
         */
        mutex_enter(&ss->nx_lock);
        mon_link = &ss->nx_thr_info;
        while ((mon = *mon_link) != NULL) {
                if (mon->ti_hca_guid != del_hca_guid) {
                        /* Not ours; leave it and move on */
                        mon_link = &mon->ti_next;
                        continue;
                }

                /* Unlink from the live list ... */
                *mon_link = mon->ti_next;

                /* ... and push onto the to-stop list */
                mon->ti_next = doomed;
                doomed = mon;
        }
        mutex_exit(&ss->nx_lock);

        /*
         * Stop every collected monitor.  The successor is read before
         * the stop call, so the entry is not touched afterwards.
         */
        mon = doomed;
        while (mon != NULL) {
                following = mon->ti_next;
                eibnx_stop_port_monitor(mon);
                mon = following;
        }

        /*
         * Take the HCA itself out of our list; 'hca' ends up NULL when
         * no entry carries this guid.
         */
        mutex_enter(&ss->nx_lock);
        for (hca_link = &ss->nx_hca; (hca = *hca_link) != NULL;
            hca_link = &hca->hc_next) {
                if (hca->hc_guid == del_hca_guid) {
                        *hca_link = hca->hc_next;
                        hca->hc_next = NULL;
                        break;
                }
        }
        mutex_exit(&ss->nx_lock);

        /*
         * Release all HCA related resources
         */
        if (hca != NULL) {
                (void) eibnx_cleanup_hca(hca);
        }
}