root/usr/src/uts/common/io/ib/clients/eoib/eib_svc.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/callb.h>
#include <sys/mac_provider.h>

#include <sys/ib/clients/eoib/eib_impl.h>
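
/*
 * This file implements the service kthreads used by the EoIB driver
 * (the async event handler, the rwqe refiller, the vnic creator, the
 * tx wqe and LSO buffer monitors and the keepalive manager), along
 * with the routines that stop them and the event callbacks invoked by
 * the EoIB nexus.
 */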

/*
 * Thread to handle EoIB events asynchronously
 */
void
eib_events_handler(eib_t *ss)
{
        eib_event_t *evi;
        eib_event_t *nxt;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR);
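        /*
         * The callb_cpr_t registered above ties this kthread into the
         * CPR (suspend/resume) framework: before blocking in cv_wait()
         * we mark ourselves safe with CALLB_CPR_SAFE_BEGIN() and mark
         * ourselves busy again with CALLB_CPR_SAFE_END() on wakeup.
         * ci_lock exists only to protect the callb_cpr_t state.  All
         * of the service threads in this file follow the same pattern.
         */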

wait_for_event:
        mutex_enter(&ss->ei_ev_lock);
        while ((evi = ss->ei_event) == NULL) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&ss->ei_ev_cv, &ss->ei_ev_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * Are we being asked to die?
         */
        if (evi->ev_code == EIB_EV_SHUTDOWN) {
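                /*
                 * A shutdown notice is always queued at the head of the
                 * list (see eib_svc_enqueue_event()), so everything from
                 * this entry onward can simply be freed.
                 */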
                while (evi) {
                        nxt = evi->ev_next;
                        kmem_free(evi, sizeof (eib_event_t));
                        evi = nxt;
                }
                ss->ei_event = NULL;
                mutex_exit(&ss->ei_ev_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        /*
         * Otherwise, pull out the first entry from our work queue
         */
        ss->ei_event = evi->ev_next;
        evi->ev_next = NULL;

        mutex_exit(&ss->ei_ev_lock);

        /*
         * Process this event
         *
         * Note that we don't want to race with plumb/unplumb in this
         * handler, since we may have to restart vnics or touch state
         * that could get re-initialized or released if plumb/unplumb
         * were allowed to proceed in parallel.
         */
        eib_mac_set_nic_state(ss, EIB_NIC_RESTARTING);

        switch (evi->ev_code) {
        case EIB_EV_PORT_DOWN:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_PORT_DOWN");

                eib_mac_link_down(ss, B_FALSE);

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_PORT_DOWN");
                break;

        case EIB_EV_PORT_UP:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_PORT_UP");

                eib_ibt_link_mod(ss);

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_PORT_UP");
                break;

        case EIB_EV_PKEY_CHANGE:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_PKEY_CHANGE");

                eib_ibt_link_mod(ss);

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_PKEY_CHANGE");
                break;

        case EIB_EV_SGID_CHANGE:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_SGID_CHANGE");

                eib_ibt_link_mod(ss);

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_SGID_CHANGE");
                break;

        case EIB_EV_CLNT_REREG:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_CLNT_REREG");

                eib_ibt_link_mod(ss);

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_CLNT_REREG");
                break;

        case EIB_EV_GW_UP:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_GW_UP");

                /*
                 * The EoIB nexus has notified us that our gateway is now
                 * reachable.  Unless we already think it is reachable,
                 * mark it so in our records and try to resurrect dead
                 * vnics.
                 */
                mutex_enter(&ss->ei_vnic_lock);
                if (ss->ei_gw_unreachable == B_FALSE) {
                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: gw reachable");
                        mutex_exit(&ss->ei_vnic_lock);

                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: End EIB_EV_GW_UP");
                        break;
                }
                ss->ei_gw_unreachable = B_FALSE;
                mutex_exit(&ss->ei_vnic_lock);

                /*
                 * If we've not even started yet, we have nothing to do.
                 */
                if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) == 0) {
                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: End EIB_EV_GW_UP");
                        break;
                }

                if (eib_mac_hca_portstate(ss, NULL, NULL) != EIB_E_SUCCESS) {
                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: "
                            "HCA portstate failed, marking link down");

                        eib_mac_link_down(ss, B_FALSE);
                } else {
                        uint8_t vn0_mac[ETHERADDRL];

                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: "
                            "HCA portstate ok, resurrecting zombies");

                        bcopy(eib_zero_mac, vn0_mac, ETHERADDRL);
                        eib_vnic_resurrect_zombies(ss, vn0_mac);

                        /*
                         * If we've resurrected the zombies because the gateway
                         * went down and came back, it is possible our unicast
                         * mac address changed from what it was earlier. If
                         * so, we need to update our unicast address with the
                         * mac layer before marking the link up.
                         */
                        if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0) {
                                EIB_DPRINTF_DEBUG(ss->ei_instance,
                                    "eib_events_handler: updating unicast "
                                    "addr to %x:%x:%x:%x:%x:%x", vn0_mac[0],
                                    vn0_mac[1], vn0_mac[2], vn0_mac[3],
                                    vn0_mac[4], vn0_mac[5]);

                                mac_unicst_update(ss->ei_mac_hdl, vn0_mac);
                        }

                        EIB_DPRINTF_DEBUG(ss->ei_instance,
                            "eib_events_handler: eib_mac_link_up(B_FALSE)");

                        eib_mac_link_up(ss, B_FALSE);
                }

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_GW_UP");
                break;

        case EIB_EV_GW_INFO_UPDATE:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin EIB_EV_GW_INFO_UPDATE");

                if (evi->ev_arg) {
                        eib_update_props(ss, (eib_gw_info_t *)(evi->ev_arg));
                        kmem_free(evi->ev_arg, sizeof (eib_gw_info_t));
                }

                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: End EIB_EV_GW_INFO_UPDATE");
                break;

        case EIB_EV_MCG_DELETED:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin-End EIB_EV_MCG_DELETED");
                break;

        case EIB_EV_MCG_CREATED:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin-End EIB_EV_MCG_CREATED");
                break;

        case EIB_EV_GW_EPORT_DOWN:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin-End EIB_EV_GW_EPORT_DOWN");
                break;

        case EIB_EV_GW_DOWN:
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_events_handler: Begin-End EIB_EV_GW_DOWN");
                break;
        }

        eib_mac_clr_nic_state(ss, EIB_NIC_RESTARTING);

        kmem_free(evi, sizeof (eib_event_t));
        goto wait_for_event;

        /*NOTREACHED*/
}

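/*
 * Queue an event for processing by eib_events_handler().  A shutdown
 * notice goes to the head of the work queue; everything else is added
 * at the tail.  The caller allocates the eib_event_t and ownership
 * passes to the handler, which frees the entry once it has processed
 * it.  A typical caller looks like this (illustrative only):
 *
 *      evi = kmem_zalloc(sizeof (eib_event_t), KM_SLEEP);
 *      evi->ev_code = EIB_EV_PORT_UP;
 *      evi->ev_arg = NULL;
 *      eib_svc_enqueue_event(ss, evi);
 */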
void
eib_svc_enqueue_event(eib_t *ss, eib_event_t *evi)
{
        eib_event_t *elem = NULL;
        eib_event_t *tail = NULL;

        mutex_enter(&ss->ei_ev_lock);

        /*
         * A shutdown notice has a higher priority than the
         * rest and goes to the head of the list; everything
         * else is appended at the tail.
         */
        if (evi->ev_code == EIB_EV_SHUTDOWN) {
                evi->ev_next = ss->ei_event;
                ss->ei_event = evi;
        } else {
                for (elem = ss->ei_event; elem; elem = elem->ev_next)
                        tail = elem;

                if (tail)
                        tail->ev_next = evi;
                else
                        ss->ei_event = evi;
        }

        cv_signal(&ss->ei_ev_cv);
        mutex_exit(&ss->ei_ev_lock);
}

/*
 * Thread to refill channels with rwqes whenever they get low.
 */
void
eib_refill_rwqes(eib_t *ss)
{
        eib_chan_t *chan;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_RWQES_REFILLER);

wait_for_refill_work:
        mutex_enter(&ss->ei_rxpost_lock);

        while ((ss->ei_rxpost == NULL) && (ss->ei_rxpost_die == 0)) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&ss->ei_rxpost_cv, &ss->ei_rxpost_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * Discard all requests for refill if we're being asked to die
         */
        if (ss->ei_rxpost_die) {
                ss->ei_rxpost = NULL;
                mutex_exit(&ss->ei_rxpost_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }
        ASSERT(ss->ei_rxpost != NULL);

        /*
         * Take the first element out of the queue
         */
        chan = ss->ei_rxpost;
        ss->ei_rxpost = chan->ch_rxpost_next;
        chan->ch_rxpost_next = NULL;

        mutex_exit(&ss->ei_rxpost_lock);

        /*
         * Try to post a bunch of recv wqes into this channel. If we
         * fail, it means that we haven't even been able to post a
         * single recv wqe.  This is alarming, but there's nothing
         * we can do. We just move on to the next channel needing
         * our service.
         */
        if (eib_chan_post_rx(ss, chan, NULL) != EIB_E_SUCCESS) {
                EIB_DPRINTF_ERR(ss->ei_instance,
                    "eib_refill_rwqes: eib_chan_post_rx() failed");
        }

        /*
         * Mark it to indicate that the refilling is done
         */
        mutex_enter(&chan->ch_rx_lock);
        chan->ch_rx_refilling = B_FALSE;
        mutex_exit(&chan->ch_rx_lock);

        goto wait_for_refill_work;

        /*NOTREACHED*/
}

/*
 * Thread to create or restart vnics when required
 */
void
eib_vnic_creator(eib_t *ss)
{
        eib_vnic_req_t *vrq;
        eib_vnic_req_t *elem;
        eib_vnic_req_t *nxt;
        kmutex_t ci_lock;
        callb_cpr_t ci;
        uint_t vr_req;
        uint8_t *vr_mac;
        int ret;
        int err;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_VNIC_CREATOR);

wait_for_vnic_req:
        mutex_enter(&ss->ei_vnic_req_lock);

        while ((vrq = ss->ei_vnic_req) == NULL) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&ss->ei_vnic_req_cv, &ss->ei_vnic_req_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * Pull out the first request
         */
        ss->ei_vnic_req = vrq->vr_next;
        vrq->vr_next = NULL;

        vr_req = vrq->vr_req;
        vr_mac = vrq->vr_mac;
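
        /*
         * vr_req and vr_mac were cached above because vrq itself may be
         * moved onto the failed list or freed below before we are done
         * with them (vr_req is checked again after the switch).
         */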

        switch (vr_req) {
        case EIB_CR_REQ_DIE:
        case EIB_CR_REQ_FLUSH:
                /*
                 * Clean up all pending and failed reqs
                 */
                for (elem = ss->ei_vnic_req; elem; elem = nxt) {
                        nxt = elem->vr_next;
                        kmem_free(elem, sizeof (eib_vnic_req_t));
                }
                for (elem = ss->ei_failed_vnic_req; elem; elem = nxt) {
                        nxt = elem->vr_next;
                        kmem_free(elem, sizeof (eib_vnic_req_t));
                }
                ss->ei_vnic_req = NULL;
                ss->ei_failed_vnic_req = NULL;
                ss->ei_pending_vnic_req = NULL;
                mutex_exit(&ss->ei_vnic_req_lock);

                break;

        case EIB_CR_REQ_NEW_VNIC:
                ss->ei_pending_vnic_req = vrq;
                mutex_exit(&ss->ei_vnic_req_lock);

                EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: "
                    "new vnic creation request for %x:%x:%x:%x:%x:%x, 0x%x",
                    vr_mac[0], vr_mac[1], vr_mac[2], vr_mac[3], vr_mac[4],
                    vr_mac[5], vrq->vr_vlan);

                /*
                 * Make sure we don't race with the plumb/unplumb code.  If
                 * the eoib instance has been unplumbed already, we ignore any
                 * creation requests that may have been pending.
                 */
                eib_mac_set_nic_state(ss, EIB_NIC_STARTING);

                if ((ss->ei_node_state->ns_nic_state & EIB_NIC_STARTED) !=
                    EIB_NIC_STARTED) {
                        mutex_enter(&ss->ei_vnic_req_lock);
                        ss->ei_pending_vnic_req = NULL;
                        mutex_exit(&ss->ei_vnic_req_lock);
                        eib_mac_clr_nic_state(ss, EIB_NIC_STARTING);
                        break;
                }

                /*
                 * Try to create a new vnic with the supplied parameters.
                 */
                err = 0;
                if ((ret = eib_vnic_create(ss, vrq->vr_mac, vrq->vr_vlan,
                    NULL, &err)) != EIB_E_SUCCESS) {
                        EIB_DPRINTF_WARN(ss->ei_instance, "eib_vnic_creator: "
                            "eib_vnic_create(mac=%x:%x:%x:%x:%x:%x, vlan=0x%x) "
                            "failed, ret=%d", vr_mac[0], vr_mac[1], vr_mac[2],
                            vr_mac[3], vr_mac[4], vr_mac[5], vrq->vr_vlan, err);
                }

                /*
                 * If we failed (and the failure was not because the vnic
                 * already exists), add this vnic req to our failed list so
                 * we won't try to create this vnic again.  Whether we fail
                 * or succeed, we're done with processing this req, so clear
                 * the pending req.
                 */
                mutex_enter(&ss->ei_vnic_req_lock);
                if ((ret != EIB_E_SUCCESS) && (err != EEXIST)) {
                        vrq->vr_next = ss->ei_failed_vnic_req;
                        ss->ei_failed_vnic_req = vrq;
                        vrq = NULL;
                }
                ss->ei_pending_vnic_req = NULL;
                mutex_exit(&ss->ei_vnic_req_lock);

                /*
                 * Notify the mac layer that it should retry its pending tx.
                 * If we created the vnic successfully, those packets can now
                 * be sent; if we did not, packets destined for this vnic will
                 * be dropped.
                 */
                EIB_DPRINTF_DEBUG(ss->ei_instance,
                    "eib_vnic_creator: calling mac_tx_update()");
                mac_tx_update(ss->ei_mac_hdl);

                eib_mac_clr_nic_state(ss, EIB_NIC_STARTING);
                break;

        default:
                EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_vnic_creator: "
                    "unknown request 0x%x, ignoring", vrq->vr_req);
                break;
        }

        /*
         * Free the current req and quit if we have to
         */
        if (vrq) {
                kmem_free(vrq, sizeof (eib_vnic_req_t));
        }

        if (vr_req == EIB_CR_REQ_DIE) {
                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        goto wait_for_vnic_req;
        /*NOTREACHED*/
}

/*
 * Thread to monitor tx wqes and update the mac layer when needed.
 * Note that this thread can only be started after the tx wqe pool
 * has been allocated and initialized.
 */
void
eib_monitor_tx_wqes(eib_t *ss)
{
        eib_wqe_pool_t *wp = ss->ei_tx;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_TXWQES_MONITOR);

        ASSERT(wp != NULL);

monitor_wqe_status:
        mutex_enter(&wp->wp_lock);

        /*
         * Wait till someone falls short of wqes
         */
        while (wp->wp_status == 0) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&wp->wp_cv, &wp->wp_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * Have we been asked to die?
         */
        if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) {
                mutex_exit(&wp->wp_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        ASSERT((wp->wp_status & EIB_TXWQE_SHORT) != 0);

        /*
         * Start monitoring free wqes till they climb back up to the
         * high-water mark
         */
        while ((wp->wp_nfree < EIB_NFREE_SWQES_HWM) &&
            ((wp->wp_status & EIB_TXWQE_MONITOR_DIE) == 0)) {

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&wp->wp_cv, &wp->wp_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        /*
         * Have we been asked to die?
         */
        if (wp->wp_status & EIB_TXWQE_MONITOR_DIE) {
                mutex_exit(&wp->wp_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        ASSERT(wp->wp_nfree >= EIB_NFREE_SWQES_HWM);
        wp->wp_status &= (~EIB_TXWQE_SHORT);

        mutex_exit(&wp->wp_lock);

        /*
         * Inform the mac layer that tx resources are now available
         * and go back to monitoring
         */
        if (ss->ei_mac_hdl) {
                mac_tx_update(ss->ei_mac_hdl);
        }
        goto monitor_wqe_status;

        /*NOTREACHED*/
}

/*
 * Thread to monitor lso bufs and update the mac layer as needed.
 * Note that this thread can only be started after the lso buckets
 * have been allocated and initialized.
 */
void
eib_monitor_lso_bufs(eib_t *ss)
{
        eib_lsobkt_t *bkt = ss->ei_lso;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_LSOBUFS_MONITOR);

        ASSERT(bkt != NULL);

monitor_lso_status:
        mutex_enter(&bkt->bk_lock);

        /*
         * Wait till someone falls short of LSO buffers or we're asked
         * to die
         */
        while (bkt->bk_status == 0) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&bkt->bk_cv, &bkt->bk_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) {
                mutex_exit(&bkt->bk_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        ASSERT((bkt->bk_status & EIB_LBUF_SHORT) != 0);

        /*
         * Start monitoring free LSO buffers till there are enough
         * free buffers available
         */
        while ((bkt->bk_nfree < EIB_LSO_FREE_BUFS_THRESH) &&
            ((bkt->bk_status & EIB_LBUF_MONITOR_DIE) == 0)) {

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

                cv_wait(&bkt->bk_cv, &bkt->bk_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        if (bkt->bk_status & EIB_LBUF_MONITOR_DIE) {
                mutex_exit(&bkt->bk_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        /*
         * We have enough lso buffers available now
         */
        ASSERT(bkt->bk_nfree >= EIB_LSO_FREE_BUFS_THRESH);
        bkt->bk_status &= (~EIB_LBUF_SHORT);

        mutex_exit(&bkt->bk_lock);

        /*
         * Inform the mac layer that tx lso resources are now available
         * and go back to monitoring
         */
        if (ss->ei_mac_hdl) {
                mac_tx_update(ss->ei_mac_hdl);
        }
        goto monitor_lso_status;

        /*NOTREACHED*/
}

/*
 * Thread to manage the keepalive requirements for vnics and the gateway.
 */
void
eib_manage_keepalives(eib_t *ss)
{
        eib_ka_vnics_t *elem;
        eib_ka_vnics_t *nxt;
        clock_t deadline;
        int64_t lbolt64;
        int err;
        kmutex_t ci_lock;
        callb_cpr_t ci;

        mutex_init(&ci_lock, NULL, MUTEX_DRIVER, NULL);
        CALLB_CPR_INIT(&ci, &ci_lock, callb_generic_cpr, EIB_EVENTS_HDLR);

        mutex_enter(&ss->ei_ka_vnics_lock);

periodic_keepalive:
        deadline = ddi_get_lbolt() + ss->ei_gw_props->pp_vnic_ka_ticks;

        while ((ss->ei_ka_vnics_event &
            (EIB_KA_VNICS_DIE | EIB_KA_VNICS_TIMED_OUT)) == 0) {
                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_BEGIN(&ci);
                mutex_exit(&ci_lock);

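                /*
                 * cv_timedwait() returns -1 if the deadline passed
                 * without a signal; treat that as our periodic
                 * keepalive timer firing.
                 */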
                if (cv_timedwait(&ss->ei_ka_vnics_cv, &ss->ei_ka_vnics_lock,
                    deadline) == -1) {
                        ss->ei_ka_vnics_event |= EIB_KA_VNICS_TIMED_OUT;
                }

                mutex_enter(&ci_lock);
                CALLB_CPR_SAFE_END(&ci, &ci_lock);
                mutex_exit(&ci_lock);
        }

        if (ss->ei_ka_vnics_event & EIB_KA_VNICS_DIE) {
                for (elem = ss->ei_ka_vnics; elem; elem = nxt) {
                        nxt = elem->ka_next;
                        kmem_free(elem, sizeof (eib_ka_vnics_t));
                }
                ss->ei_ka_vnics = NULL;
                mutex_exit(&ss->ei_ka_vnics_lock);

                mutex_enter(&ci_lock);
                CALLB_CPR_EXIT(&ci);
                mutex_destroy(&ci_lock);

                return;
        }

        /*
         * Are there any vnics that need keepalive management ?
         */
        ss->ei_ka_vnics_event &= ~EIB_KA_VNICS_TIMED_OUT;
        if (ss->ei_ka_vnics == NULL)
                goto periodic_keepalive;

        /*
         * Ok, we need to send vnic keepalives to our gateway. But first
         * check if the gateway heartbeat is good as of this moment.  Note
         * that we need to get the lbolt value after acquiring ei_vnic_lock
         * to ensure that ei_gw_last_heartbeat does not change before the
         * comparison (to avoid a negative value in the comparison result
         * causing us to incorrectly assume that the gateway heartbeat has
         * stopped).
         */
        mutex_enter(&ss->ei_vnic_lock);

        lbolt64 = ddi_get_lbolt64();

        if (ss->ei_gw_last_heartbeat != 0) {
                if ((lbolt64 - ss->ei_gw_last_heartbeat) >
                    ss->ei_gw_props->pp_gw_ka_ticks) {

                        EIB_DPRINTF_WARN(ss->ei_instance,
                            "eib_manage_keepalives: no keepalives from gateway "
                            "0x%x for hca_guid=0x%llx, port=0x%x, "
                            "last_gw_ka=0x%llx", ss->ei_gw_props->pp_gw_portid,
                            ss->ei_props->ep_hca_guid,
                            ss->ei_props->ep_port_num,
                            ss->ei_gw_last_heartbeat);

                        for (elem = ss->ei_ka_vnics; elem; elem = nxt) {
                                nxt = elem->ka_next;
                                ss->ei_zombie_vnics |=
                                    ((uint64_t)1 << elem->ka_vnic->vn_instance);
                                kmem_free(elem, sizeof (eib_ka_vnics_t));
                        }
                        ss->ei_ka_vnics = NULL;
                        ss->ei_gw_unreachable = B_TRUE;
                        mutex_exit(&ss->ei_vnic_lock);

                        eib_mac_link_down(ss, B_FALSE);

                        goto periodic_keepalive;
                }
        }
        mutex_exit(&ss->ei_vnic_lock);

        for (elem = ss->ei_ka_vnics; elem; elem = elem->ka_next)
                (void) eib_fip_heartbeat(ss, elem->ka_vnic, &err);

        goto periodic_keepalive;
        /*NOTREACHED*/
}

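/*
 * The routines below are used to stop the service threads: each one
 * asks its thread to exit (by queueing a shutdown request or setting a
 * die flag and signalling the thread's cv) and then waits for it with
 * thread_join().
 */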
void
eib_stop_events_handler(eib_t *ss)
{
        eib_event_t *evi;

        evi = kmem_zalloc(sizeof (eib_event_t), KM_SLEEP);
        evi->ev_code = EIB_EV_SHUTDOWN;
        evi->ev_arg = NULL;

        eib_svc_enqueue_event(ss, evi);

        thread_join(ss->ei_events_handler);
}

void
eib_stop_refill_rwqes(eib_t *ss)
{
        mutex_enter(&ss->ei_rxpost_lock);

        ss->ei_rxpost_die = 1;

        cv_signal(&ss->ei_rxpost_cv);
        mutex_exit(&ss->ei_rxpost_lock);

        thread_join(ss->ei_rwqes_refiller);
}

void
eib_stop_vnic_creator(eib_t *ss)
{
        eib_vnic_req_t *vrq;

        vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP);
        vrq->vr_req = EIB_CR_REQ_DIE;
        vrq->vr_next = NULL;

        eib_vnic_enqueue_req(ss, vrq);

        thread_join(ss->ei_vnic_creator);
}

void
eib_stop_monitor_tx_wqes(eib_t *ss)
{
        eib_wqe_pool_t *wp = ss->ei_tx;

        mutex_enter(&wp->wp_lock);

        wp->wp_status |= EIB_TXWQE_MONITOR_DIE;

        cv_signal(&wp->wp_cv);
        mutex_exit(&wp->wp_lock);

        thread_join(ss->ei_txwqe_monitor);
}

int
eib_stop_monitor_lso_bufs(eib_t *ss, boolean_t force)
{
        eib_lsobkt_t *bkt = ss->ei_lso;

        mutex_enter(&bkt->bk_lock);

        /*
         * If there are some buffers still not reaped and the force
         * flag is not set, return without doing anything. Otherwise,
         * stop the lso bufs monitor and wait for it to die.
         */
        if ((bkt->bk_nelem != bkt->bk_nfree) && (force == B_FALSE)) {
                mutex_exit(&bkt->bk_lock);
                return (EIB_E_FAILURE);
        }

        bkt->bk_status |= EIB_LBUF_MONITOR_DIE;

        cv_signal(&bkt->bk_cv);
        mutex_exit(&bkt->bk_lock);

        thread_join(ss->ei_lsobufs_monitor);
        return (EIB_E_SUCCESS);
}

void
eib_stop_manage_keepalives(eib_t *ss)
{
        mutex_enter(&ss->ei_ka_vnics_lock);

        ss->ei_ka_vnics_event |= EIB_KA_VNICS_DIE;

        cv_signal(&ss->ei_ka_vnics_cv);
        mutex_exit(&ss->ei_ka_vnics_lock);

        thread_join(ss->ei_keepalives_manager);
}

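/*
 * Queue a flush request for the vnic creator thread so that it drops
 * all pending and failed vnic creation requests.  Unlike the stop
 * routines above, this does not wait for the thread.
 */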
void
eib_flush_vnic_reqs(eib_t *ss)
{
        eib_vnic_req_t *vrq;

        vrq = kmem_zalloc(sizeof (eib_vnic_req_t), KM_SLEEP);
        vrq->vr_req = EIB_CR_REQ_FLUSH;
        vrq->vr_next = NULL;

        eib_vnic_enqueue_req(ss, vrq);
}

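/*
 * The remaining routines are DDI event callbacks invoked by the EoIB
 * nexus: gateway alive and gateway info updates are turned into
 * eib_event_t entries for eib_events_handler(), while login acks are
 * parsed and handed to eib_vnic_login_ack() directly.
 */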
/*ARGSUSED*/
void
eib_gw_alive_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
        eib_t *ss = (eib_t *)arg;
        eib_event_t *evi;

        evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
        if (evi == NULL) {
                EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_alive_cb: "
                    "no memory, ignoring this gateway alive event");
        } else {
                evi->ev_code = EIB_EV_GW_UP;
                evi->ev_arg = NULL;
                eib_svc_enqueue_event(ss, evi);
        }
}

/*ARGSUSED*/
void
eib_login_ack_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
        eib_t *ss = (eib_t *)arg;
        uint8_t *pkt = (uint8_t *)impl_data;
        eib_login_data_t ld;

        /*
         * We have received a login ack message from the gateway via the EoIB
         * nexus (solicitation qpn).  The packet is passed to us raw (unparsed)
         * and we have to figure out if this is a vnic login ack.
         */
        if (eib_fip_parse_login_ack(ss, pkt + EIB_GRH_SZ, &ld) == EIB_E_SUCCESS)
                eib_vnic_login_ack(ss, &ld);
}

/*ARGSUSED*/
void
eib_gw_info_cb(dev_info_t *dip, ddi_eventcookie_t cookie, void *arg,
    void *impl_data)
{
        eib_t *ss = (eib_t *)arg;
        eib_event_t *evi;

        evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
        if (evi == NULL) {
                EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: "
                    "no memory, ignoring this gateway props update event");
                return;
        }
        evi->ev_arg = kmem_zalloc(sizeof (eib_gw_info_t), KM_NOSLEEP);
        if (evi->ev_arg == NULL) {
                EIB_DPRINTF_WARN(ss->ei_instance, "eib_gw_info_cb: "
                    "no memory, ignoring this gateway props update event");
                kmem_free(evi, sizeof (eib_event_t));
                return;
        }
        bcopy(impl_data, evi->ev_arg, sizeof (eib_gw_info_t));
        evi->ev_code = EIB_EV_GW_INFO_UPDATE;

        eib_svc_enqueue_event(ss, evi);
}