root/usr/src/uts/common/io/ib/adapters/hermon/hermon_misc.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * hermon_misc.c
 *    Hermon Miscellaneous routines - Address Handle, Multicast, Protection
 *    Domain, and port-related operations
 *
 *    Implements all the routines necessary for allocating, freeing, querying
 *    and modifying Address Handles and Protection Domains.  Also implements
 *    all the routines necessary for adding and removing Queue Pairs to/from
 *    Multicast Groups.  Lastly, it implements the routines necessary for
 *    port-related query and modify operations.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/hermon/hermon.h>

extern int hermon_rdma_debug;
int hermon_fmr_verbose = 0;

static int hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
    hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp, uint_t *qp_found);
static int hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
    hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp);
static void hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp);
static void hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp);
static uint_t hermon_mcg_walk_mgid_hash(hermon_state_t *state,
    uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
static void hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg,
    hermon_hw_mcg_t *mcg_hdr, ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc);
static int hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
    uint_t prev_indx, hermon_hw_mcg_t *mcg_entry);
static int hermon_mcg_entry_invalidate(hermon_state_t *state,
    hermon_hw_mcg_t *mcg_entry, uint_t indx);
static int hermon_mgid_is_valid(ib_gid_t gid);
static int hermon_mlid_is_valid(ib_lid_t lid);
static void hermon_fmr_cleanup(hermon_fmrhdl_t pool);


#define HERMON_MAX_DBR_PAGES_PER_USER   64
#define HERMON_DBR_KEY(index, page) \
        (((uint64_t)index) * HERMON_MAX_DBR_PAGES_PER_USER + (page))

static hermon_udbr_page_t *
hermon_dbr_new_user_page(hermon_state_t *state, uint_t index,
    uint_t page)
{
        hermon_udbr_page_t *pagep;
        ddi_dma_attr_t dma_attr;
        uint_t cookiecnt;
        int status;
        hermon_umap_db_entry_t *umapdb;
        ulong_t pagesize = PAGESIZE;

        pagep = kmem_alloc(sizeof (*pagep), KM_SLEEP);
        pagep->upg_index = page;
        pagep->upg_nfree = pagesize / sizeof (hermon_dbr_t);

        /* Allocate 1 bit per dbr for free/alloc management (0 => "free") */
        pagep->upg_free = kmem_zalloc(pagesize / sizeof (hermon_dbr_t) / 8,
            KM_SLEEP);
        pagep->upg_kvaddr = ddi_umem_alloc(pagesize, DDI_UMEM_SLEEP,
            &pagep->upg_umemcookie); /* not HERMON_PAGESIZE here */

        pagep->upg_buf = ddi_umem_iosetup(pagep->upg_umemcookie, 0,
            pagesize, B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);

        hermon_dma_attr_init(state, &dma_attr);
#ifdef  __sparc
        if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
                dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
#endif
        status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
            DDI_DMA_SLEEP, NULL, &pagep->upg_dmahdl);
        if (status != DDI_SUCCESS) {
                IBTF_DPRINTF_L2("hermon", "hermon_new_user_page: "
                    "ddi_dma_buf_bind_handle failed: %d", status);
                return (NULL);
        }
        status = ddi_dma_buf_bind_handle(pagep->upg_dmahdl,
            pagep->upg_buf, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
            DDI_DMA_SLEEP, NULL, &pagep->upg_dmacookie, &cookiecnt);
        if (status != DDI_SUCCESS) {
                IBTF_DPRINTF_L2("hermon", "hermon_dbr_new_user_page: "
                    "ddi_dma_buf_bind_handle failed: %d", status);
                ddi_dma_free_handle(&pagep->upg_dmahdl);
                return (NULL);
        }
        ASSERT(cookiecnt == 1);

        /* create db entry for mmap */
        umapdb = hermon_umap_db_alloc(state->hs_instance,
            HERMON_DBR_KEY(index, page), MLNX_UMAP_DBRMEM_RSRC,
            (uint64_t)(uintptr_t)pagep);
        hermon_umap_db_add(umapdb);
        return (pagep);
}


/*ARGSUSED*/
static int
hermon_user_dbr_alloc(hermon_state_t *state, uint_t index,
    ddi_acc_handle_t *acchdl, hermon_dbr_t **vdbr, uint64_t *pdbr,
    uint64_t *mapoffset)
{
        hermon_user_dbr_t *udbr;
        hermon_udbr_page_t *pagep;
        uint_t next_page;
        int dbr_index;
        int i1, i2, i3, last;
        uint64_t u64, mask;

        mutex_enter(&state->hs_dbr_lock);
        for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
                if (udbr->udbr_index == index)
                        break;
        if (udbr == NULL) {
                udbr = kmem_alloc(sizeof (*udbr), KM_SLEEP);
                udbr->udbr_link = state->hs_user_dbr;
                state->hs_user_dbr = udbr;
                udbr->udbr_index = index;
                udbr->udbr_pagep = NULL;
        }
        pagep = udbr->udbr_pagep;
        next_page = (pagep == NULL) ? 0 : (pagep->upg_index + 1);
        while (pagep != NULL)
                if (pagep->upg_nfree > 0)
                        break;
                else
                        pagep = pagep->upg_link;
        if (pagep == NULL) {
                pagep = hermon_dbr_new_user_page(state, index, next_page);
                if (pagep == NULL) {
                        mutex_exit(&state->hs_dbr_lock);
                        return (DDI_FAILURE);
                }
                pagep->upg_link = udbr->udbr_pagep;
                udbr->udbr_pagep = pagep;
        }

        /* Since nfree > 0, we're assured the loops below will succeed */

        /* First, find a 64-bit (not ~0) that has a free dbr */
        last = PAGESIZE / sizeof (uint64_t) / 64;
        mask = ~0ull;
        for (i1 = 0; i1 < last; i1++)
                if ((pagep->upg_free[i1] & mask) != mask)
                        break;
        u64 = pagep->upg_free[i1];

        /* Second, find a byte (not 0xff) that has a free dbr */
        last = sizeof (uint64_t) / sizeof (uint8_t);
        for (i2 = 0, mask = 0xff; i2 < last; i2++, mask <<= 8)
                if ((u64 & mask) != mask)
                        break;

        /* Third, find a bit that is free (0) */
        for (i3 = 0; i3 < sizeof (uint64_t) / sizeof (uint8_t); i3++)
                if ((u64 & (1ul << (i3 + 8 * i2))) == 0)
                        break;

        /* Mark it as allocated */
        pagep->upg_free[i1] |= (1ul << (i3 + 8 * i2));

        dbr_index = ((i1 * sizeof (uint64_t)) + i2) * sizeof (uint64_t) + i3;
        pagep->upg_nfree--;
        ((uint64_t *)(void *)pagep->upg_kvaddr)[dbr_index] = 0; /* clear dbr */
        *mapoffset = ((HERMON_DBR_KEY(index, pagep->upg_index) <<
            MLNX_UMAP_RSRC_TYPE_SHIFT) | MLNX_UMAP_DBRMEM_RSRC) << PAGESHIFT;
        *vdbr = (hermon_dbr_t *)((uint64_t *)(void *)pagep->upg_kvaddr +
            dbr_index);
        *pdbr = pagep->upg_dmacookie.dmac_laddress + dbr_index *
            sizeof (uint64_t);

        mutex_exit(&state->hs_dbr_lock);
        return (DDI_SUCCESS);
}

static void
hermon_user_dbr_free(hermon_state_t *state, uint_t index, hermon_dbr_t *record)
{
        hermon_user_dbr_t       *udbr;
        hermon_udbr_page_t      *pagep;
        caddr_t                 kvaddr;
        uint_t                  dbr_index;
        uint_t                  max_free = PAGESIZE / sizeof (hermon_dbr_t);
        int                     i1, i2;

        dbr_index = (uintptr_t)record & PAGEOFFSET; /* offset (not yet index) */
        kvaddr = (caddr_t)record - dbr_index;
        dbr_index /= sizeof (hermon_dbr_t); /* now it's the index */

        mutex_enter(&state->hs_dbr_lock);
        for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
                if (udbr->udbr_index == index)
                        break;
        if (udbr == NULL) {
                IBTF_DPRINTF_L2("hermon", "free user dbr: udbr struct not "
                    "found for index %x", index);
                mutex_exit(&state->hs_dbr_lock);
                return;
        }
        for (pagep = udbr->udbr_pagep; pagep != NULL; pagep = pagep->upg_link)
                if (pagep->upg_kvaddr == kvaddr)
                        break;
        if (pagep == NULL) {
                IBTF_DPRINTF_L2("hermon", "free user dbr: pagep struct not"
                    " found for index %x, kvaddr %p, DBR index %x",
                    index, kvaddr, dbr_index);
                mutex_exit(&state->hs_dbr_lock);
                return;
        }
        if (pagep->upg_nfree >= max_free) {
                IBTF_DPRINTF_L2("hermon", "free user dbr: overflow: "
                    "UCE index %x, DBR index %x", index, dbr_index);
                mutex_exit(&state->hs_dbr_lock);
                return;
        }
        ASSERT(dbr_index < max_free);
        i1 = dbr_index / 64;
        i2 = dbr_index % 64;
        ASSERT((pagep->upg_free[i1] & (1ul << i2)) == (1ul << i2));
        pagep->upg_free[i1] &= ~(1ul << i2);
        pagep->upg_nfree++;
        mutex_exit(&state->hs_dbr_lock);
}

/*
 * hermon_dbr_page_alloc()
 *      first page allocation - called from attach or open
 *      in this case, we want exactly one page per call, and aligned on a
 *      page - and may need to be mapped to the user for access
 */
int
hermon_dbr_page_alloc(hermon_state_t *state, hermon_dbr_info_t **dinfo)
{
        int                     status;
        ddi_dma_handle_t        dma_hdl;
        ddi_acc_handle_t        acc_hdl;
        ddi_dma_attr_t          dma_attr;
        ddi_dma_cookie_t        cookie;
        uint_t                  cookie_cnt;
        int                     i;
        hermon_dbr_info_t       *info;
        caddr_t                 dmaaddr;
        uint64_t                dmalen;
        ulong_t                 pagesize = PAGESIZE;

        info = kmem_zalloc(sizeof (hermon_dbr_info_t), KM_SLEEP);

        /*
         * Initialize many of the default DMA attributes.  Then set additional
         * alignment restrictions if necessary for the dbr memory, meaning
         * page aligned.  Also use the configured value for IOMMU bypass
         */
        hermon_dma_attr_init(state, &dma_attr);
        dma_attr.dma_attr_align = pagesize;
        dma_attr.dma_attr_sgllen = 1;   /* make sure only one cookie */
#ifdef  __sparc
        if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
                dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
#endif

        status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
            DDI_DMA_SLEEP, NULL, &dma_hdl);
        if (status != DDI_SUCCESS) {
                kmem_free((void *)info, sizeof (hermon_dbr_info_t));
                cmn_err(CE_NOTE, "dbr DMA handle alloc failed\n");
                return (DDI_FAILURE);
        }

        status = ddi_dma_mem_alloc(dma_hdl, pagesize,
            &state->hs_reg_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP,
            NULL, &dmaaddr, (size_t *)&dmalen, &acc_hdl);
        if (status != DDI_SUCCESS)      {
                ddi_dma_free_handle(&dma_hdl);
                cmn_err(CE_CONT, "dbr DMA mem alloc failed(status %d)", status);
                kmem_free((void *)info, sizeof (hermon_dbr_info_t));
                return (DDI_FAILURE);
        }

        /* this memory won't be IB registered, so do the bind here */
        status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
            dmaaddr, (size_t)dmalen, DDI_DMA_RDWR |
            DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &cookie, &cookie_cnt);
        if (status != DDI_SUCCESS) {
                ddi_dma_mem_free(&acc_hdl);
                ddi_dma_free_handle(&dma_hdl);
                kmem_free((void *)info, sizeof (hermon_dbr_info_t));
                cmn_err(CE_CONT, "dbr DMA bind handle failed (status %d)",
                    status);
                return (DDI_FAILURE);
        }
        *dinfo = info;          /* Pass back the pointer */

        /* init the info structure with returned info */
        info->dbr_dmahdl = dma_hdl;
        info->dbr_acchdl = acc_hdl;
        info->dbr_page   = (hermon_dbr_t *)(void *)dmaaddr;
        info->dbr_link = NULL;
        /* extract the phys addr from the cookie */
        info->dbr_paddr = cookie.dmac_laddress;
        info->dbr_firstfree = 0;
        info->dbr_nfree = HERMON_NUM_DBR_PER_PAGE;
        /* link all DBrs onto the free list */
        for (i = 0; i < HERMON_NUM_DBR_PER_PAGE; i++) {
                info->dbr_page[i] = i + 1;
        }

        return (DDI_SUCCESS);
}


/*
 * hermon_dbr_alloc()
 *      DBr record allocation - called from alloc cq/qp/srq
 *      will check for available dbrs in current
 *      page - if needed it will allocate another and link them
 */

int
hermon_dbr_alloc(hermon_state_t *state, uint_t index, ddi_acc_handle_t *acchdl,
    hermon_dbr_t **vdbr, uint64_t *pdbr, uint64_t *mapoffset)
{
        hermon_dbr_t            *record = NULL;
        hermon_dbr_info_t       *info = NULL;
        uint32_t                idx;
        int                     status;

        if (index != state->hs_kernel_uar_index)
                return (hermon_user_dbr_alloc(state, index, acchdl, vdbr, pdbr,
                    mapoffset));

        mutex_enter(&state->hs_dbr_lock);
        for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
                if (info->dbr_nfree != 0)
                        break;          /* found a page w/ one available */

        if (info == NULL) {     /* did NOT find a page with one available */
                status = hermon_dbr_page_alloc(state, &info);
                if (status != DDI_SUCCESS) {
                        /* do error handling */
                        mutex_exit(&state->hs_dbr_lock);
                        return (DDI_FAILURE);
                }
                /* got a new page, so link it in. */
                info->dbr_link = state->hs_kern_dbr;
                state->hs_kern_dbr = info;
        }
        idx = info->dbr_firstfree;
        record = info->dbr_page + idx;
        info->dbr_firstfree = *record;
        info->dbr_nfree--;
        *record = 0;

        *acchdl = info->dbr_acchdl;
        *vdbr = record;
        *pdbr = info->dbr_paddr + idx * sizeof (hermon_dbr_t);
        mutex_exit(&state->hs_dbr_lock);
        return (DDI_SUCCESS);
}

/*
 * hermon_dbr_free()
 *      DBr record deallocation - called from free cq/qp
 *      will update the counter in the header, and invalidate
 *      the dbr, but will NEVER free pages of dbrs - small
 *      price to pay, but userland access never will anyway
 */
void
hermon_dbr_free(hermon_state_t *state, uint_t indx, hermon_dbr_t *record)
{
        hermon_dbr_t            *page;
        hermon_dbr_info_t       *info;

        if (indx != state->hs_kernel_uar_index) {
                hermon_user_dbr_free(state, indx, record);
                return;
        }
        page = (hermon_dbr_t *)(uintptr_t)((uintptr_t)record & PAGEMASK);
        mutex_enter(&state->hs_dbr_lock);
        for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
                if (info->dbr_page == page)
                        break;
        ASSERT(info != NULL);
        *record = info->dbr_firstfree;
        info->dbr_firstfree = record - info->dbr_page;
        info->dbr_nfree++;
        mutex_exit(&state->hs_dbr_lock);
}

/*
 * hermon_dbr_kern_free()
 *    Context: Can be called only from detach context.
 *
 *      Free all kernel dbr pages.  This includes the freeing of all the dma
 *      resources acquired during the allocation of the pages.
 *
 *      Also, free all the user dbr pages.
 */
void
hermon_dbr_kern_free(hermon_state_t *state)
{
        hermon_dbr_info_t       *info, *link;
        hermon_user_dbr_t       *udbr, *next;
        hermon_udbr_page_t      *pagep, *nextp;
        hermon_umap_db_entry_t  *umapdb;
        int                     instance, status;
        uint64_t                value;
        extern                  hermon_umap_db_t hermon_userland_rsrc_db;

        mutex_enter(&state->hs_dbr_lock);
        for (info = state->hs_kern_dbr; info != NULL; info = link) {
                (void) ddi_dma_unbind_handle(info->dbr_dmahdl);
                ddi_dma_mem_free(&info->dbr_acchdl);    /* free page */
                ddi_dma_free_handle(&info->dbr_dmahdl);
                link = info->dbr_link;
                kmem_free(info, sizeof (hermon_dbr_info_t));
        }

        udbr = state->hs_user_dbr;
        instance = state->hs_instance;
        mutex_enter(&hermon_userland_rsrc_db.hdl_umapdb_lock);
        while (udbr != NULL) {
                pagep = udbr->udbr_pagep;
                while (pagep != NULL) {
                        /* probably need to remove "db" */
                        (void) ddi_dma_unbind_handle(pagep->upg_dmahdl);
                        ddi_dma_free_handle(&pagep->upg_dmahdl);
                        freerbuf(pagep->upg_buf);
                        ddi_umem_free(pagep->upg_umemcookie);
                        status = hermon_umap_db_find_nolock(instance,
                            HERMON_DBR_KEY(udbr->udbr_index,
                            pagep->upg_index), MLNX_UMAP_DBRMEM_RSRC,
                            &value, HERMON_UMAP_DB_REMOVE, &umapdb);
                        if (status == DDI_SUCCESS)
                                hermon_umap_db_free(umapdb);
                        kmem_free(pagep->upg_free,
                            PAGESIZE / sizeof (hermon_dbr_t) / 8);
                        nextp = pagep->upg_link;
                        kmem_free(pagep, sizeof (*pagep));
                        pagep = nextp;
                }
                next = udbr->udbr_link;
                kmem_free(udbr, sizeof (*udbr));
                udbr = next;
        }
        mutex_exit(&hermon_userland_rsrc_db.hdl_umapdb_lock);
        mutex_exit(&state->hs_dbr_lock);
}

/*
 * hermon_ah_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_ah_alloc(hermon_state_t *state, hermon_pdhdl_t pd,
    ibt_adds_vect_t *attr_p, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
{
        hermon_rsrc_t           *rsrc;
        hermon_hw_udav_t        *udav;
        hermon_ahhdl_t          ah;
        int                     status;

        /*
         * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
         * indicate that we wish to allocate an "invalid" (i.e. empty)
         * address handle XXX
         */

        /* Validate that specified port number is legal */
        if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
                return (IBT_HCA_PORT_INVALID);
        }

        /*
         * Allocate the software structure for tracking the address handle
         * (i.e. the Hermon Address Handle struct).
         */
        status = hermon_rsrc_alloc(state, HERMON_AHHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                return (IBT_INSUFF_RESOURCE);
        }
        ah = (hermon_ahhdl_t)rsrc->hr_addr;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))

        /* Increment the reference count on the protection domain (PD) */
        hermon_pd_refcnt_inc(pd);

        udav = (hermon_hw_udav_t *)kmem_zalloc(sizeof (hermon_hw_udav_t),
            KM_SLEEP);
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))

        /*
         * Fill in the UDAV data. We first zero out the UDAV, then populate
         * it by then calling hermon_set_addr_path() to fill in the common
         * portions that can be pulled from the "ibt_adds_vect_t" passed in
         */
        status = hermon_set_addr_path(state, attr_p,
            (hermon_hw_addr_path_t *)udav, HERMON_ADDRPATH_UDAV);
        if (status != DDI_SUCCESS) {
                hermon_pd_refcnt_dec(pd);
                hermon_rsrc_free(state, &rsrc);
                return (status);
        }
        udav->pd        = pd->pd_pdnum;
        udav->sl        = attr_p->av_srvl;

        /*
         * Fill in the rest of the Hermon Address Handle struct.
         *
         * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
         * here because we may need to return it later to the IBTF (as a
         * result of a subsequent query operation).  Unlike the other UDAV
         * parameters, the value of "av_dgid.gid_guid" is not always preserved.
         * The reason for this is described in hermon_set_addr_path().
         */
        ah->ah_rsrcp     = rsrc;
        ah->ah_pdhdl     = pd;
        ah->ah_udav      = udav;
        ah->ah_save_guid = attr_p->av_dgid.gid_guid;
        *ahhdl = ah;

        return (DDI_SUCCESS);
}


/*
 * hermon_ah_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
hermon_ah_free(hermon_state_t *state, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
{
        hermon_rsrc_t           *rsrc;
        hermon_pdhdl_t          pd;
        hermon_ahhdl_t          ah;

        /*
         * Pull all the necessary information from the Hermon Address Handle
         * struct.  This is necessary here because the resource for the
         * AH is going to be freed up as part of this operation.
         */
        ah    = *ahhdl;
        mutex_enter(&ah->ah_lock);
        rsrc  = ah->ah_rsrcp;
        pd    = ah->ah_pdhdl;
        mutex_exit(&ah->ah_lock);
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))

        /* Free the UDAV memory */
        kmem_free(ah->ah_udav, sizeof (hermon_hw_udav_t));

        /* Decrement the reference count on the protection domain (PD) */
        hermon_pd_refcnt_dec(pd);

        /* Free the Hermon Address Handle structure */
        hermon_rsrc_free(state, &rsrc);

        /* Set the ahhdl pointer to NULL and return success */
        *ahhdl = NULL;

        return (DDI_SUCCESS);
}


/*
 * hermon_ah_query()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
hermon_ah_query(hermon_state_t *state, hermon_ahhdl_t ah, hermon_pdhdl_t *pd,
    ibt_adds_vect_t *attr_p)
{
        mutex_enter(&ah->ah_lock);
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))

        /*
         * Pull the PD and UDAV from the Hermon Address Handle structure
         */
        *pd = ah->ah_pdhdl;

        /*
         * Fill in "ibt_adds_vect_t".  We call hermon_get_addr_path() to fill
         * the common portions that can be pulled from the UDAV we pass in.
         *
         * NOTE: We will also fill the "av_dgid.gid_guid" field from the
         * "ah_save_guid" field we have previously saved away.  The reason
         * for this is described in hermon_ah_alloc() and hermon_ah_modify().
         */
        hermon_get_addr_path(state, (hermon_hw_addr_path_t *)ah->ah_udav,
            attr_p, HERMON_ADDRPATH_UDAV);

        attr_p->av_dgid.gid_guid = ah->ah_save_guid;

        mutex_exit(&ah->ah_lock);
        return (DDI_SUCCESS);
}


/*
 * hermon_ah_modify()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
hermon_ah_modify(hermon_state_t *state, hermon_ahhdl_t ah,
    ibt_adds_vect_t *attr_p)
{
        hermon_hw_udav_t        old_udav;
        uint64_t                data_old;
        int                     status, size, i;

        /* Validate that specified port number is legal */
        if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
                return (IBT_HCA_PORT_INVALID);
        }

        mutex_enter(&ah->ah_lock);

        /* Save a copy of the current UDAV data in old_udav. */
        bcopy(ah->ah_udav, &old_udav, sizeof (hermon_hw_udav_t));

        /*
         * Fill in the new UDAV with the caller's data, passed in via the
         * "ibt_adds_vect_t" structure.
         *
         * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
         * field here (just as we did during hermon_ah_alloc()) because we
         * may need to return it later to the IBTF (as a result of a
         * subsequent query operation).  As explained in hermon_ah_alloc(),
         * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
         * is not always preserved. The reason for this is described in
         * hermon_set_addr_path().
         */
        status = hermon_set_addr_path(state, attr_p,
            (hermon_hw_addr_path_t *)ah->ah_udav, HERMON_ADDRPATH_UDAV);
        if (status != DDI_SUCCESS) {
                mutex_exit(&ah->ah_lock);
                return (status);
        }
        ah->ah_save_guid = attr_p->av_dgid.gid_guid;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(ah->ah_udav)))
        ah->ah_udav->sl  = attr_p->av_srvl;

        /*
         * Copy changes into the new UDAV.
         *    Note:  We copy in 64-bit chunks.  For the first two of these
         *    chunks it is necessary to read the current contents of the
         *    UDAV, mask off the modifiable portions (maintaining any
         *    of the "reserved" portions), and then mask on the new data.
         */
        size = sizeof (hermon_hw_udav_t) >> 3;
        for (i = 0; i < size; i++) {
                data_old = ((uint64_t *)&old_udav)[i];

                /*
                 * Apply mask to change only the relevant values.
                 */
                if (i == 0) {
                        data_old = data_old & HERMON_UDAV_MODIFY_MASK0;
                } else if (i == 1) {
                        data_old = data_old & HERMON_UDAV_MODIFY_MASK1;
                } else {
                        data_old = 0;
                }

                /* Store the updated values to the UDAV */
                ((uint64_t *)ah->ah_udav)[i] |= data_old;
        }

        /*
         * Put the valid PD number back into the UDAV entry, as it
         * might have been clobbered above.
         */
        ah->ah_udav->pd = old_udav.pd;


        mutex_exit(&ah->ah_lock);
        return (DDI_SUCCESS);
}

/*
 * hermon_mcg_attach()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_mcg_attach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
    ib_lid_t lid)
{
        hermon_rsrc_t           *rsrc;
        hermon_hw_mcg_t         *mcg_entry;
        hermon_hw_mcg_qp_list_t *mcg_entry_qplist;
        hermon_mcghdl_t         mcg, newmcg;
        uint64_t                mgid_hash;
        uint32_t                end_indx;
        int                     status;
        uint_t                  qp_found;

        /*
         * It is only allowed to attach MCG to UD queue pairs.  Verify
         * that the intended QP is of the appropriate transport type
         */
        if (qp->qp_serv_type != HERMON_QP_UD) {
                return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Check for invalid Multicast DLID.  Specifically, all Multicast
         * LIDs should be within a well defined range.  If the specified LID
         * is outside of that range, then return an error.
         */
        if (hermon_mlid_is_valid(lid) == 0) {
                return (IBT_MC_MLID_INVALID);
        }
        /*
         * Check for invalid Multicast GID.  All Multicast GIDs should have
         * a well-defined pattern of bits and flags that are allowable.  If
         * the specified GID does not meet the criteria, then return an error.
         */
        if (hermon_mgid_is_valid(gid) == 0) {
                return (IBT_MC_MGID_INVALID);
        }

        /*
         * Compute the MGID hash value.  Since the MCG table is arranged as
         * a number of separate hash chains, this operation converts the
         * specified MGID into the starting index of an entry in the hash
         * table (i.e. the index for the start of the appropriate hash chain).
         * Subsequent operations below will walk the chain searching for the
         * right place to add this new QP.
         */
        status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
            &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
        if (status != HERMON_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Grab the multicast group mutex.  Then grab the pre-allocated
         * temporary buffer used for holding and/or modifying MCG entries.
         * Zero out the temporary MCG entry before we begin.
         */
        mutex_enter(&state->hs_mcglock);
        mcg_entry = state->hs_mcgtmp;
        mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
        bzero(mcg_entry, HERMON_MCGMEM_SZ(state));

        /*
         * Walk through the array of MCG entries starting at "mgid_hash".
         * Try to find the appropriate place for this new QP to be added.
         * This could happen when the first entry of the chain has MGID == 0
         * (which means that the hash chain is empty), or because we find
         * an entry with the same MGID (in which case we'll add the QP to
         * that MCG), or because we come to the end of the chain (in which
         * case this is the first QP being added to the multicast group that
         * corresponds to the MGID.  The hermon_mcg_walk_mgid_hash() routine
         * walks the list and returns an index into the MCG table.  The entry
         * at this index is then checked to determine which case we have
         * fallen into (see below).  Note:  We are using the "shadow" MCG
         * list (of hermon_mcg_t structs) for this lookup because the real
         * MCG entries are in hardware (and the lookup process would be much
         * more time consuming).
         */
        end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
        mcg      = &state->hs_mcghdl[end_indx];

        /*
         * If MGID == 0, then the hash chain is empty.  Just fill in the
         * current entry.  Note:  No need to allocate an MCG table entry
         * as all the hash chain "heads" are already preallocated.
         */
        if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {

                /* Fill in the current entry in the "shadow" MCG list */
                hermon_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);

                /*
                 * Try to add the new QP number to the list.  This (and the
                 * above) routine fills in a temporary MCG.  The "mcg_entry"
                 * and "mcg_entry_qplist" pointers simply point to different
                 * offsets within the same temporary copy of the MCG (for
                 * convenience).  Note:  If this fails, we need to invalidate
                 * the entries we've already put into the "shadow" list entry
                 * above.
                 */
                status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
                    &qp_found);
                if (status != DDI_SUCCESS) {
                        bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
                        mutex_exit(&state->hs_mcglock);
                        return (status);
                }
                if (!qp_found)
                        mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
                            /* set the member count */

                /*
                 * Once the temporary MCG has been filled in, write the entry
                 * into the appropriate location in the Hermon MCG entry table.
                 * If it's successful, then drop the lock and return success.
                 * Note: In general, this operation shouldn't fail.  If it
                 * does, then it is an indication that something (probably in
                 * HW, but maybe in SW) has gone seriously wrong.  We still
                 * want to zero out the entries that we've filled in above
                 * (in the hermon_mcg_setup_new_hdr() routine).
                 */
                status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
                        mutex_exit(&state->hs_mcglock);
                        HERMON_WARNING(state, "failed to write MCG entry");
                        cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /*
                 * Now that we know all the Hermon firmware accesses have been
                 * successful, we update the "shadow" MCG entry by incrementing
                 * the "number of attached QPs" count.
                 *
                 * We increment only if the QP is not already part of the
                 * MCG by checking the 'qp_found' flag returned from the
                 * qplist_add above.
                 */
                if (!qp_found) {
                        mcg->mcg_num_qps++;

                        /*
                         * Increment the refcnt for this QP.  Because the QP
                         * was added to this MCG, the refcnt must be
                         * incremented.
                         */
                        hermon_qp_mcg_refcnt_inc(qp);
                }

                /*
                 * We drop the lock and return success.
                 */
                mutex_exit(&state->hs_mcglock);
                return (DDI_SUCCESS);
        }

        /*
         * If the specified MGID matches the MGID in the current entry, then
         * we need to try to add the QP to the current MCG entry.  In this
         * case, it means that we need to read the existing MCG entry (into
         * the temporary MCG), add the new QP number to the temporary entry
         * (using the same method we used above), and write the entry back
         * to the hardware (same as above).
         */
        if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
            (mcg->mcg_mgid_l == gid.gid_guid)) {

                /*
                 * Read the current MCG entry into the temporary MCG.  Note:
                 * In general, this operation shouldn't fail.  If it does,
                 * then it is an indication that something (probably in HW,
                 * but maybe in SW) has gone seriously wrong.
                 */
                status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        mutex_exit(&state->hs_mcglock);
                        HERMON_WARNING(state, "failed to read MCG entry");
                        cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /*
                 * Try to add the new QP number to the list.  This routine
                 * fills in the necessary pieces of the temporary MCG.  The
                 * "mcg_entry_qplist" pointer is used to point to the portion
                 * of the temporary MCG that holds the QP numbers.
                 *
                 * Note: hermon_mcg_qplist_add() returns SUCCESS if it
                 * already found the QP in the list.  In this case, the QP is
                 * not added on to the list again.  Check the flag 'qp_found'
                 * if this value is needed to be known.
                 *
                 */
                status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
                    &qp_found);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&state->hs_mcglock);
                        return (status);
                }
                if (!qp_found)
                        mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
                            /* set the member count */

                /*
                 * Once the temporary MCG has been updated, write the entry
                 * into the appropriate location in the Hermon MCG entry table.
                 * If it's successful, then drop the lock and return success.
                 * Note: In general, this operation shouldn't fail.  If it
                 * does, then it is an indication that something (probably in
                 * HW, but maybe in SW) has gone seriously wrong.
                 */
                status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        mutex_exit(&state->hs_mcglock);
                        HERMON_WARNING(state, "failed to write MCG entry");
                        cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /*
                 * Now that we know all the Hermon firmware accesses have been
                 * successful, we update the current "shadow" MCG entry by
                 * incrementing the "number of attached QPs" count.
                 *
                 * We increment only if the QP is not already part of the
                 * MCG by checking the 'qp_found' flag returned
                 * hermon_mcg_walk_mgid_hashfrom the qplist_add above.
                 */
                if (!qp_found) {
                        mcg->mcg_num_qps++;

                        /*
                         * Increment the refcnt for this QP.  Because the QP
                         * was added to this MCG, the refcnt must be
                         * incremented.
                         */
                        hermon_qp_mcg_refcnt_inc(qp);
                }

                /*
                 * We drop the lock and return success.
                 */
                mutex_exit(&state->hs_mcglock);
                return (DDI_SUCCESS);
        }

        /*
         * If we've reached here, then we're at the end of the hash chain.
         * We need to allocate a new MCG entry, fill it in, write it to Hermon,
         * and update the previous entry to link the new one to the end of the
         * chain.
         */

        /*
         * Allocate an MCG table entry.  This will be filled in with all
         * the necessary parameters to define the multicast group.  Then it
         * will be written to the hardware in the next-to-last step below.
         */
        status = hermon_rsrc_alloc(state, HERMON_MCG, 1, HERMON_NOSLEEP, &rsrc);
        if (status != DDI_SUCCESS) {
                mutex_exit(&state->hs_mcglock);
                return (IBT_INSUFF_RESOURCE);
        }

        /*
         * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
         * it does above, hermon_mcg_setup_new_hdr() also fills in a portion
         * of the temporary MCG entry (the rest of which will be filled in by
         * hermon_mcg_qplist_add() below)
         */
        newmcg = &state->hs_mcghdl[rsrc->hr_indx];
        hermon_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);

        /*
         * Try to add the new QP number to the list.  This routine fills in
         * the final necessary pieces of the temporary MCG.  The
         * "mcg_entry_qplist" pointer is used to point to the portion of the
         * temporary MCG that holds the QP numbers.  If we fail here, we
         * must undo the previous resource allocation.
         *
         * Note: hermon_mcg_qplist_add() can we return SUCCESS if it already
         * found the QP in the list.  In this case, the QP is not added on to
         * the list again.  Check the flag 'qp_found' if this value is needed
         * to be known.
         */
        status = hermon_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
            &qp_found);
        if (status != DDI_SUCCESS) {
                bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
                hermon_rsrc_free(state, &rsrc);
                mutex_exit(&state->hs_mcglock);
                return (status);
        }
        mcg_entry->member_cnt = (newmcg->mcg_num_qps + 1);
            /* set the member count */

        /*
         * Once the temporary MCG has been updated, write the entry into the
         * appropriate location in the Hermon MCG entry table.  If this is
         * successful, then we need to chain the previous entry to this one.
         * Note: In general, this operation shouldn't fail.  If it does, then
         * it is an indication that something (probably in HW, but maybe in
         * SW) has gone seriously wrong.
         */
        status = hermon_write_mgm_cmd_post(state, mcg_entry, rsrc->hr_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
                hermon_rsrc_free(state, &rsrc);
                mutex_exit(&state->hs_mcglock);
                HERMON_WARNING(state, "failed to write MCG entry");
                cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Now read the current MCG entry (the one previously at the end of
         * hash chain) into the temporary MCG.  We are going to update its
         * "next_gid_indx" now and write the entry back to the MCG table.
         * Note:  In general, this operation shouldn't fail.  If it does, then
         * it is an indication that something (probably in HW, but maybe in SW)
         * has gone seriously wrong.  We will free up the MCG entry resource,
         * but we will not undo the previously written MCG entry in the HW.
         * This is OK, though, because the MCG entry is not currently attached
         * to any hash chain.
         */
        status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
                hermon_rsrc_free(state, &rsrc);
                mutex_exit(&state->hs_mcglock);
                HERMON_WARNING(state, "failed to read MCG entry");
                cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Finally, we update the "next_gid_indx" field in the temporary MCG
         * and attempt to write the entry back into the Hermon MCG table.  If
         * this succeeds, then we update the "shadow" list to reflect the
         * change, drop the lock, and return success.  Note:  In general, this
         * operation shouldn't fail.  If it does, then it is an indication
         * that something (probably in HW, but maybe in SW) has gone seriously
         * wrong.  Just as we do above, we will free up the MCG entry resource,
         * but we will not try to undo the previously written MCG entry.  This
         * is OK, though, because (since we failed here to update the end of
         * the chain) that other entry is not currently attached to any chain.
         */
        mcg_entry->next_gid_indx = rsrc->hr_indx;
        status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
                hermon_rsrc_free(state, &rsrc);
                mutex_exit(&state->hs_mcglock);
                HERMON_WARNING(state, "failed to write MCG entry");
                cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }
        mcg = &state->hs_mcghdl[end_indx];
        mcg->mcg_next_indx = rsrc->hr_indx;

        /*
         * Now that we know all the Hermon firmware accesses have been
         * successful, we update the new "shadow" MCG entry by incrementing
         * the "number of attached QPs" count.  Then we drop the lock and
         * return success.
         */
        newmcg->mcg_num_qps++;

        /*
         * Increment the refcnt for this QP.  Because the QP
         * was added to this MCG, the refcnt must be
         * incremented.
         */
        hermon_qp_mcg_refcnt_inc(qp);

        mutex_exit(&state->hs_mcglock);
        return (DDI_SUCCESS);
}


/*
 * hermon_mcg_detach()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_mcg_detach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
    ib_lid_t lid)
{
        hermon_hw_mcg_t         *mcg_entry;
        hermon_hw_mcg_qp_list_t *mcg_entry_qplist;
        hermon_mcghdl_t         mcg;
        uint64_t                mgid_hash;
        uint32_t                end_indx, prev_indx;
        int                     status;

        /*
         * Check for invalid Multicast DLID.  Specifically, all Multicast
         * LIDs should be within a well defined range.  If the specified LID
         * is outside of that range, then return an error.
         */
        if (hermon_mlid_is_valid(lid) == 0) {
                return (IBT_MC_MLID_INVALID);
        }

        /*
         * Compute the MGID hash value.  As described above, the MCG table is
         * arranged as a number of separate hash chains.  This operation
         * converts the specified MGID into the starting index of an entry in
         * the hash table (i.e. the index for the start of the appropriate
         * hash chain).  Subsequent operations below will walk the chain
         * searching for a matching entry from which to attempt to remove
         * the specified QP.
         */
        status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
            &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
        if (status != HERMON_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Grab the multicast group mutex.  Then grab the pre-allocated
         * temporary buffer used for holding and/or modifying MCG entries.
         */
        mutex_enter(&state->hs_mcglock);
        mcg_entry = state->hs_mcgtmp;
        mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);

        /*
         * Walk through the array of MCG entries starting at "mgid_hash".
         * Try to find an MCG entry with a matching MGID.  The
         * hermon_mcg_walk_mgid_hash() routine walks the list and returns an
         * index into the MCG table.  The entry at this index is checked to
         * determine whether it is a match or not.  If it is a match, then
         * we continue on to attempt to remove the QP from the MCG.  If it
         * is not a match (or not a valid MCG entry), then we return an error.
         */
        end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
        mcg      = &state->hs_mcghdl[end_indx];

        /*
         * If MGID == 0 (the hash chain is empty) or if the specified MGID
         * does not match the MGID in the current entry, then return
         * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
         * valid).
         */
        if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
            ((mcg->mcg_mgid_h != gid.gid_prefix) ||
            (mcg->mcg_mgid_l != gid.gid_guid))) {
                mutex_exit(&state->hs_mcglock);
                return (IBT_MC_MGID_INVALID);
        }

        /*
         * Read the current MCG entry into the temporary MCG.  Note: In
         * general, this operation shouldn't fail.  If it does, then it is
         * an indication that something (probably in HW, but maybe in SW)
         * has gone seriously wrong.
         */
        status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                mutex_exit(&state->hs_mcglock);
                HERMON_WARNING(state, "failed to read MCG entry");
                cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Search the QP number list for a match.  If a match is found, then
         * remove the entry from the QP list.  Otherwise, if no match is found,
         * return an error.
         */
        status = hermon_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
        if (status != DDI_SUCCESS) {
                mutex_exit(&state->hs_mcglock);
                return (status);
        }

        /*
         * Decrement the MCG count for this QP.  When the 'qp_mcg'
         * field becomes 0, then this QP is no longer a member of any
         * MCG.
         */
        hermon_qp_mcg_refcnt_dec(qp);

        /*
         * If the current MCG's QP number list is about to be made empty
         * ("mcg_num_qps" == 1), then remove the entry itself from the hash
         * chain.  Otherwise, just write the updated MCG entry back to the
         * hardware.  In either case, once we successfully update the hardware
         * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
         * count (or zero out the entire "shadow" list entry) before returning
         * success.  Note:  Zeroing out the "shadow" list entry is done
         * inside of hermon_mcg_hash_list_remove().
         */
        if (mcg->mcg_num_qps == 1) {

                /* Remove an MCG entry from the hash chain */
                status = hermon_mcg_hash_list_remove(state, end_indx, prev_indx,
                    mcg_entry);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&state->hs_mcglock);
                        return (status);
                }

        } else {
                /*
                 * Write the updated MCG entry back to the Hermon MCG table.
                 * If this succeeds, then we update the "shadow" list to
                 * reflect the change (i.e. decrement the "mcg_num_qps"),
                 * drop the lock, and return success.  Note:  In general,
                 * this operation shouldn't fail.  If it does, then it is an
                 * indication that something (probably in HW, but maybe in SW)
                 * has gone seriously wrong.
                 */
                mcg_entry->member_cnt = (mcg->mcg_num_qps - 1);
                status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        mutex_exit(&state->hs_mcglock);
                        HERMON_WARNING(state, "failed to write MCG entry");
                        cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }
                mcg->mcg_num_qps--;
        }

        mutex_exit(&state->hs_mcglock);
        return (DDI_SUCCESS);
}

/*
 * hermon_qp_mcg_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
static void
hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp)
{
        /* Increment the QP's MCG reference count */
        mutex_enter(&qp->qp_lock);
        qp->qp_mcg_refcnt++;
        mutex_exit(&qp->qp_lock);
}


/*
 * hermon_qp_mcg_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
static void
hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp)
{
        /* Decrement the QP's MCG reference count */
        mutex_enter(&qp->qp_lock);
        qp->qp_mcg_refcnt--;
        mutex_exit(&qp->qp_lock);
}


/*
 * hermon_mcg_qplist_add()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
    hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp,
    uint_t *qp_found)
{
        uint_t          qplist_indx;

        ASSERT(MUTEX_HELD(&state->hs_mcglock));

        qplist_indx = mcg->mcg_num_qps;

        /*
         * Determine if we have exceeded the maximum number of QP per
         * multicast group.  If we have, then return an error
         */
        if (qplist_indx >= state->hs_cfg_profile->cp_num_qp_per_mcg) {
                return (IBT_HCA_MCG_QP_EXCEEDED);
        }

        /*
         * Determine if the QP is already attached to this MCG table.  If it
         * is, then we break out and treat this operation as a NO-OP
         */
        for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
            qplist_indx++) {
                if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
                        break;
                }
        }

        /*
         * If the QP was already on the list, set 'qp_found' to TRUE.  We still
         * return SUCCESS in this case, but the qplist will not have been
         * updated because the QP was already on the list.
         */
        if (qplist_indx < mcg->mcg_num_qps) {
                *qp_found = 1;
        } else {
                /*
                 * Otherwise, append the new QP number to the end of the
                 * current QP list.  Note: We will increment the "mcg_num_qps"
                 * field on the "shadow" MCG list entry later (after we know
                 * that all necessary Hermon firmware accesses have been
                 * successful).
                 *
                 * Set 'qp_found' to 0 so we know the QP was added on to the
                 * list for sure.
                 */
                mcg_qplist[qplist_indx].qpn =
                    (qp->qp_qpnum | HERMON_MCG_QPN_BLOCK_LB);
                *qp_found = 0;
        }

        return (DDI_SUCCESS);
}



/*
 * hermon_mcg_qplist_remove()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
    hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp)
{
        uint_t          i, qplist_indx;

        /*
         * Search the MCG QP list for a matching QPN.  When
         * it's found, we swap the last entry with the current
         * one, set the last entry to zero, decrement the last
         * entry, and return.  If it's not found, then it's
         * and error.
         */
        qplist_indx = mcg->mcg_num_qps;
        for (i = 0; i < qplist_indx; i++) {
                if (mcg_qplist[i].qpn == qp->qp_qpnum) {
                        mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
                        mcg_qplist[qplist_indx - 1].qpn = 0;

                        return (DDI_SUCCESS);
                }
        }

        return (IBT_QP_HDL_INVALID);
}


/*
 * hermon_mcg_walk_mgid_hash()
 *    Context: Can be called from interrupt or base context.
 */
static uint_t
hermon_mcg_walk_mgid_hash(hermon_state_t *state, uint64_t start_indx,
    ib_gid_t mgid, uint_t *p_indx)
{
        hermon_mcghdl_t curr_mcghdl;
        uint_t          curr_indx, prev_indx;

        ASSERT(MUTEX_HELD(&state->hs_mcglock));

        /* Start at the head of the hash chain */
        curr_indx   = (uint_t)start_indx;
        prev_indx   = curr_indx;
        curr_mcghdl = &state->hs_mcghdl[curr_indx];

        /* If the first entry in the chain has MGID == 0, then stop */
        if ((curr_mcghdl->mcg_mgid_h == 0) &&
            (curr_mcghdl->mcg_mgid_l == 0)) {
                goto end_mgid_hash_walk;
        }

        /* If the first entry in the chain matches the MGID, then stop */
        if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
            (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
                goto end_mgid_hash_walk;
        }

        /* Otherwise, walk the hash chain looking for a match */
        while (curr_mcghdl->mcg_next_indx != 0) {
                prev_indx = curr_indx;
                curr_indx = curr_mcghdl->mcg_next_indx;
                curr_mcghdl = &state->hs_mcghdl[curr_indx];

                if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
                    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
                        break;
                }
        }

end_mgid_hash_walk:
        /*
         * If necessary, return the index of the previous entry too.  This
         * is primarily used for detaching a QP from a multicast group.  It
         * may be necessary, in that case, to delete an MCG entry from the
         * hash chain and having the index of the previous entry is helpful.
         */
        if (p_indx != NULL) {
                *p_indx = prev_indx;
        }
        return (curr_indx);
}


/*
 * hermon_mcg_setup_new_hdr()
 *    Context: Can be called from interrupt or base context.
 */
static void
hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg, hermon_hw_mcg_t *mcg_hdr,
    ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc)
{
        /*
         * Fill in the fields of the "shadow" entry used by software
         * to track MCG hardware entry
         */
        mcg->mcg_mgid_h    = mgid.gid_prefix;
        mcg->mcg_mgid_l    = mgid.gid_guid;
        mcg->mcg_rsrcp     = mcg_rsrc;
        mcg->mcg_next_indx = 0;
        mcg->mcg_num_qps   = 0;

        /*
         * Fill the header fields of the MCG entry (in the temporary copy)
         */
        mcg_hdr->mgid_h         = mgid.gid_prefix;
        mcg_hdr->mgid_l         = mgid.gid_guid;
        mcg_hdr->next_gid_indx  = 0;
}


/*
 * hermon_mcg_hash_list_remove()
 *    Context: Can be called only from user or kernel context.
 */
static int
hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
    uint_t prev_indx, hermon_hw_mcg_t *mcg_entry)
{
        hermon_mcghdl_t         curr_mcg, prev_mcg, next_mcg;
        uint_t                  next_indx;
        int                     status;

        /* Get the pointer to "shadow" list for current entry */
        curr_mcg = &state->hs_mcghdl[curr_indx];

        /*
         * If this is the first entry on a hash chain, then attempt to replace
         * the entry with the next entry on the chain.  If there are no
         * subsequent entries on the chain, then this is the only entry and
         * should be invalidated.
         */
        if (curr_indx == prev_indx) {

                /*
                 * If this is the only entry on the chain, then invalidate it.
                 * Note:  Invalidating an MCG entry means writing all zeros
                 * to the entry.  This is only necessary for those MCG
                 * entries that are the "head" entries of the individual hash
                 * chains.  Regardless of whether this operation returns
                 * success or failure, return that result to the caller.
                 */
                next_indx = curr_mcg->mcg_next_indx;
                if (next_indx == 0) {
                        status = hermon_mcg_entry_invalidate(state, mcg_entry,
                            curr_indx);
                        bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
                        return (status);
                }

                /*
                 * Otherwise, this is just the first entry on the chain, so
                 * grab the next one
                 */
                next_mcg = &state->hs_mcghdl[next_indx];

                /*
                 * Read the next MCG entry into the temporary MCG.  Note:
                 * In general, this operation shouldn't fail.  If it does,
                 * then it is an indication that something (probably in HW,
                 * but maybe in SW) has gone seriously wrong.
                 */
                status = hermon_read_mgm_cmd_post(state, mcg_entry, next_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        HERMON_WARNING(state, "failed to read MCG entry");
                        cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /*
                 * Copy/Write the temporary MCG back to the hardware MCG list
                 * using the current index.  This essentially removes the
                 * current MCG entry from the list by writing over it with
                 * the next one.  If this is successful, then we can do the
                 * same operation for the "shadow" list.  And we can also
                 * free up the Hermon MCG entry resource that was associated
                 * with the (old) next entry.  Note:  In general, this
                 * operation shouldn't fail.  If it does, then it is an
                 * indication that something (probably in HW, but maybe in SW)
                 * has gone seriously wrong.
                 */
                status = hermon_write_mgm_cmd_post(state, mcg_entry, curr_indx,
                    HERMON_CMD_NOSLEEP_SPIN);
                if (status != HERMON_CMD_SUCCESS) {
                        HERMON_WARNING(state, "failed to write MCG entry");
                        cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
                            "%08x\n", status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /*
                 * Copy all the software tracking information from the next
                 * entry on the "shadow" MCG list into the current entry on
                 * the list.  Then invalidate (zero out) the other "shadow"
                 * list entry.
                 */
                bcopy(next_mcg, curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
                bzero(next_mcg, sizeof (struct hermon_sw_mcg_list_s));

                /*
                 * Free up the Hermon MCG entry resource used by the "next"
                 * MCG entry.  That resource is no longer needed by any
                 * MCG entry which is first on a hash chain (like the "next"
                 * entry has just become).
                 */
                hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);

                return (DDI_SUCCESS);
        }

        /*
         * Else if this is the last entry on the hash chain (or a middle
         * entry, then we update the previous entry's "next_gid_index" field
         * to make it point instead to the next entry on the chain.  By
         * skipping over the removed entry in this way, we can then free up
         * any resources associated with the current entry.  Note:  We don't
         * need to invalidate the "skipped over" hardware entry because it
         * will no be longer connected to any hash chains, and if/when it is
         * finally re-used, it will be written with entirely new values.
         */

        /*
         * Read the next MCG entry into the temporary MCG.  Note:  In general,
         * this operation shouldn't fail.  If it does, then it is an
         * indication that something (probably in HW, but maybe in SW) has
         * gone seriously wrong.
         */
        status = hermon_read_mgm_cmd_post(state, mcg_entry, prev_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                HERMON_WARNING(state, "failed to read MCG entry");
                cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Finally, we update the "next_gid_indx" field in the temporary MCG
         * and attempt to write the entry back into the Hermon MCG table.  If
         * this succeeds, then we update the "shadow" list to reflect the
         * change, free up the Hermon MCG entry resource that was associated
         * with the current entry, and return success.  Note:  In general,
         * this operation shouldn't fail.  If it does, then it is an indication
         * that something (probably in HW, but maybe in SW) has gone seriously
         * wrong.
         */
        mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
        status = hermon_write_mgm_cmd_post(state, mcg_entry, prev_indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                HERMON_WARNING(state, "failed to write MCG entry");
                cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR,
                            HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Get the pointer to the "shadow" MCG list entry for the previous
         * MCG.  Update its "mcg_next_indx" to point to the next entry
         * the one after the current entry. Note:  This next index may be
         * zero, indicating the end of the list.
         */
        prev_mcg = &state->hs_mcghdl[prev_indx];
        prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;

        /*
         * Free up the Hermon MCG entry resource used by the current entry.
         * This resource is no longer needed because the chain now skips over
         * the current entry.  Then invalidate (zero out) the current "shadow"
         * list entry.
         */
        hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
        bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));

        return (DDI_SUCCESS);
}


/*
 * hermon_mcg_entry_invalidate()
 *    Context: Can be called only from user or kernel context.
 */
static int
hermon_mcg_entry_invalidate(hermon_state_t *state, hermon_hw_mcg_t *mcg_entry,
    uint_t indx)
{
        int             status;

        /*
         * Invalidate the hardware MCG entry by zeroing out this temporary
         * MCG and writing it the the hardware.  Note: In general, this
         * operation shouldn't fail.  If it does, then it is an indication
         * that something (probably in HW, but maybe in SW) has gone seriously
         * wrong.
         */
        bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
        status = hermon_write_mgm_cmd_post(state, mcg_entry, indx,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                HERMON_WARNING(state, "failed to write MCG entry");
                cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
                    status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        return (DDI_SUCCESS);
}


/*
 * hermon_mgid_is_valid()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_mgid_is_valid(ib_gid_t gid)
{
        uint_t          topbits, flags, scope;

        /*
         * According to IBA 1.1 specification (section 4.1.1) a valid
         * "multicast GID" must have its top eight bits set to all ones
         */
        topbits = (gid.gid_prefix >> HERMON_MCG_TOPBITS_SHIFT) &
            HERMON_MCG_TOPBITS_MASK;
        if (topbits != HERMON_MCG_TOPBITS) {
                return (0);
        }

        /*
         * The next 4 bits are the "flag" bits.  These are valid only
         * if they are "0" (which correspond to permanently assigned/
         * "well-known" multicast GIDs) or "1" (for so-called "transient"
         * multicast GIDs).  All other values are reserved.
         */
        flags = (gid.gid_prefix >> HERMON_MCG_FLAGS_SHIFT) &
            HERMON_MCG_FLAGS_MASK;
        if (!((flags == HERMON_MCG_FLAGS_PERM) ||
            (flags == HERMON_MCG_FLAGS_NONPERM))) {
                return (0);
        }

        /*
         * The next 4 bits are the "scope" bits.  These are valid only
         * if they are "2" (Link-local), "5" (Site-local), "8"
         * (Organization-local) or "E" (Global).  All other values
         * are reserved (or currently unassigned).
         */
        scope = (gid.gid_prefix >> HERMON_MCG_SCOPE_SHIFT) &
            HERMON_MCG_SCOPE_MASK;
        if (!((scope == HERMON_MCG_SCOPE_LINKLOC) ||
            (scope == HERMON_MCG_SCOPE_SITELOC)  ||
            (scope == HERMON_MCG_SCOPE_ORGLOC)   ||
            (scope == HERMON_MCG_SCOPE_GLOBAL))) {
                return (0);
        }

        /*
         * If it passes all of the above checks, then we will consider it
         * a valid multicast GID.
         */
        return (1);
}


/*
 * hermon_mlid_is_valid()
 *    Context: Can be called from interrupt or base context.
 */
static int
hermon_mlid_is_valid(ib_lid_t lid)
{
        /*
         * According to IBA 1.1 specification (section 4.1.1) a valid
         * "multicast DLID" must be between 0xC000 and 0xFFFE.
         */
        if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
                return (0);
        }

        return (1);
}


/*
 * hermon_pd_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_pd_alloc(hermon_state_t *state, hermon_pdhdl_t *pdhdl, uint_t sleepflag)
{
        hermon_rsrc_t   *rsrc;
        hermon_pdhdl_t  pd;
        int             status;

        /*
         * Allocate the software structure for tracking the protection domain
         * (i.e. the Hermon Protection Domain handle).  By default each PD
         * structure will have a unique PD number assigned to it.  All that
         * is necessary is for software to initialize the PD reference count
         * (to zero) and return success.
         */
        status = hermon_rsrc_alloc(state, HERMON_PDHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                return (IBT_INSUFF_RESOURCE);
        }
        pd = (hermon_pdhdl_t)rsrc->hr_addr;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))

        pd->pd_refcnt = 0;
        *pdhdl = pd;

        return (DDI_SUCCESS);
}


/*
 * hermon_pd_free()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_pd_free(hermon_state_t *state, hermon_pdhdl_t *pdhdl)
{
        hermon_rsrc_t   *rsrc;
        hermon_pdhdl_t  pd;

        /*
         * Pull all the necessary information from the Hermon Protection Domain
         * handle.  This is necessary here because the resource for the
         * PD is going to be freed up as part of this operation.
         */
        pd   = *pdhdl;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
        rsrc = pd->pd_rsrcp;

        /*
         * Check the PD reference count.  If the reference count is non-zero,
         * then it means that this protection domain is still referenced by
         * some memory region, queue pair, address handle, or other IB object
         * If it is non-zero, then return an error.  Otherwise, free the
         * Hermon resource and return success.
         */
        if (pd->pd_refcnt != 0) {
                return (IBT_PD_IN_USE);
        }

        /* Free the Hermon Protection Domain handle */
        hermon_rsrc_free(state, &rsrc);

        /* Set the pdhdl pointer to NULL and return success */
        *pdhdl = (hermon_pdhdl_t)NULL;

        return (DDI_SUCCESS);
}


/*
 * hermon_pd_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
hermon_pd_refcnt_inc(hermon_pdhdl_t pd)
{
        /* Increment the protection domain's reference count */
        atomic_inc_32(&pd->pd_refcnt);
}


/*
 * hermon_pd_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
hermon_pd_refcnt_dec(hermon_pdhdl_t pd)
{
        /* Decrement the protection domain's reference count */
        atomic_dec_32(&pd->pd_refcnt);
}


/*
 * hermon_port_query()
 *    Context: Can be called only from user or kernel context.
 */
int
hermon_port_query(hermon_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
{
        sm_portinfo_t           portinfo;
        sm_guidinfo_t           guidinfo;
        sm_pkey_table_t         pkeytable;
        ib_gid_t                *sgid;
        uint_t                  sgid_max, pkey_max, tbl_size;
        int                     i, j, indx, status;
        ib_pkey_t               *pkeyp;
        ib_guid_t               *guidp;

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))

        /* Validate that specified port number is legal */
        if (!hermon_portnum_is_valid(state, port)) {
                return (IBT_HCA_PORT_INVALID);
        }
        pkeyp = state->hs_pkey[port - 1];
        guidp = state->hs_guid[port - 1];

        /*
         * We use the Hermon MAD_IFC command to post a GetPortInfo MAD
         * to the firmware (for the specified port number).  This returns
         * a full PortInfo MAD (in "portinfo") which we subsequently
         * parse to fill in the "ibt_hca_portinfo_t" structure returned
         * to the IBTF.
         */
        status = hermon_getportinfo_cmd_post(state, port,
            HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
        if (status != HERMON_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Hermon: GetPortInfo (port %02d) command "
                    "failed: %08x\n", port, status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Parse the PortInfo MAD and fill in the IBTF structure
         */
        pi->p_base_lid          = portinfo.LID;
        pi->p_qkey_violations   = portinfo.Q_KeyViolations;
        pi->p_pkey_violations   = portinfo.P_KeyViolations;
        pi->p_sm_sl             = portinfo.MasterSMSL;
        pi->p_sm_lid            = portinfo.MasterSMLID;
        pi->p_linkstate         = portinfo.PortState;
        pi->p_port_num          = portinfo.LocalPortNum;
        pi->p_phys_state        = portinfo.PortPhysicalState;
        pi->p_width_supported   = portinfo.LinkWidthSupported;
        pi->p_width_enabled     = portinfo.LinkWidthEnabled;
        pi->p_width_active      = portinfo.LinkWidthActive;
        pi->p_speed_supported   = portinfo.LinkSpeedSupported;
        pi->p_speed_enabled     = portinfo.LinkSpeedEnabled;
        pi->p_speed_active      = portinfo.LinkSpeedActive;
        pi->p_mtu               = portinfo.MTUCap;
        pi->p_lmc               = portinfo.LMC;
        pi->p_max_vl            = portinfo.VLCap;
        pi->p_subnet_timeout    = portinfo.SubnetTimeOut;
        pi->p_msg_sz            = ((uint32_t)1 << HERMON_QP_LOG_MAX_MSGSZ);
        tbl_size = state->hs_cfg_profile->cp_log_max_gidtbl;
        pi->p_sgid_tbl_sz       = (1 << tbl_size);
        tbl_size = state->hs_cfg_profile->cp_log_max_pkeytbl;
        pi->p_pkey_tbl_sz       = (1 << tbl_size);
        state->hs_sn_prefix[port - 1] = portinfo.GidPrefix;

        /*
         * Convert InfiniBand-defined port capability flags to the format
         * specified by the IBTF
         */
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
                pi->p_capabilities |= IBT_PORT_CAP_SM;
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
                pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
                pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
                pi->p_capabilities |= IBT_PORT_CAP_DM;
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
                pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
        if (portinfo.CapabilityMask & SM_CAP_MASK_IS_CLNT_REREG_SUPPD)
                pi->p_capabilities |= IBT_PORT_CAP_CLNT_REREG;

        /*
         * Fill in the SGID table.  Since the only access to the Hermon
         * GID tables is through the firmware's MAD_IFC interface, we
         * post as many GetGUIDInfo MADs as necessary to read in the entire
         * contents of the SGID table (for the specified port).  Note:  The
         * GetGUIDInfo command only gets eight GUIDs per operation.  These
         * GUIDs are then appended to the GID prefix for the port (from the
         * GetPortInfo above) to form the entire SGID table.
         */
        for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
                status = hermon_getguidinfo_cmd_post(state, port, i >> 3,
                    HERMON_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
                if (status != HERMON_CMD_SUCCESS) {
                        cmn_err(CE_CONT, "Hermon: GetGUIDInfo (port %02d) "
                            "command failed: %08x\n", port, status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /* Figure out how many of the entries are valid */
                sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
                for (j = 0; j < sgid_max; j++) {
                        indx = (i + j);
                        sgid = &pi->p_sgid_tbl[indx];
                        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
                        sgid->gid_prefix = portinfo.GidPrefix;
                        guidp[indx] = sgid->gid_guid =
                            guidinfo.GUIDBlocks[j];
                }
        }

        /*
         * Fill in the PKey table.  Just as for the GID tables above, the
         * only access to the Hermon PKey tables is through the firmware's
         * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
         * to read in the entire contents of the PKey table (for the specified
         * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
         * operation.
         */
        for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
                status = hermon_getpkeytable_cmd_post(state, port, i,
                    HERMON_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
                if (status != HERMON_CMD_SUCCESS) {
                        cmn_err(CE_CONT, "Hermon: GetPKeyTable (port %02d) "
                            "command failed: %08x\n", port, status);
                        if (status == HERMON_CMD_INVALID_STATUS) {
                                hermon_fm_ereport(state, HCA_SYS_ERR,
                                    HCA_ERR_SRV_LOST);
                        }
                        return (ibc_get_ci_failure(0));
                }

                /* Figure out how many of the entries are valid */
                pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
                for (j = 0; j < pkey_max; j++) {
                        indx = (i + j);
                        pkeyp[indx] = pi->p_pkey_tbl[indx] =
                            pkeytable.P_KeyTableBlocks[j];
                }
        }

        return (DDI_SUCCESS);
}


/*
 * hermon_port_modify()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
hermon_port_modify(hermon_state_t *state, uint8_t port,
    ibt_port_modify_flags_t flags, uint8_t init_type)
{
        sm_portinfo_t           portinfo;
        uint32_t                capmask;
        int                     status;
        hermon_hw_set_port_t    set_port;

        /*
         * Return an error if either of the unsupported flags are set
         */
        if ((flags & IBT_PORT_SHUTDOWN) ||
            (flags & IBT_PORT_SET_INIT_TYPE)) {
                return (IBT_NOT_SUPPORTED);
        }

        bzero(&set_port, sizeof (set_port));

        /*
         * Determine whether we are trying to reset the QKey counter
         */
        if (flags & IBT_PORT_RESET_QKEY)
                set_port.rqk = 1;

        /* Validate that specified port number is legal */
        if (!hermon_portnum_is_valid(state, port)) {
                return (IBT_HCA_PORT_INVALID);
        }

        /*
         * Use the Hermon MAD_IFC command to post a GetPortInfo MAD to the
         * firmware (for the specified port number).  This returns a full
         * PortInfo MAD (in "portinfo") from which we pull the current
         * capability mask.  We then modify the capability mask as directed
         * by the "pmod_flags" field, and write the updated capability mask
         * using the Hermon SET_IB command (below).
         */
        status = hermon_getportinfo_cmd_post(state, port,
            HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
        if (status != HERMON_CMD_SUCCESS) {
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        /*
         * Convert InfiniBand-defined port capability flags to the format
         * specified by the IBTF.  Specifically, we modify the capability
         * mask based on the specified values.
         */
        capmask = portinfo.CapabilityMask;

        if (flags & IBT_PORT_RESET_SM)
                capmask &= ~SM_CAP_MASK_IS_SM;
        else if (flags & IBT_PORT_SET_SM)
                capmask |= SM_CAP_MASK_IS_SM;

        if (flags & IBT_PORT_RESET_SNMP)
                capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
        else if (flags & IBT_PORT_SET_SNMP)
                capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;

        if (flags & IBT_PORT_RESET_DEVMGT)
                capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
        else if (flags & IBT_PORT_SET_DEVMGT)
                capmask |= SM_CAP_MASK_IS_DM_SUPPD;

        if (flags & IBT_PORT_RESET_VENDOR)
                capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
        else if (flags & IBT_PORT_SET_VENDOR)
                capmask |= SM_CAP_MASK_IS_VM_SUPPD;

        set_port.cap_mask = capmask;

        /*
         * Use the Hermon SET_PORT command to update the capability mask and
         * (possibly) reset the QKey violation counter for the specified port.
         * Note: In general, this operation shouldn't fail.  If it does, then
         * it is an indication that something (probably in HW, but maybe in
         * SW) has gone seriously wrong.
         */
        status = hermon_set_port_cmd_post(state, &set_port, port,
            HERMON_SLEEPFLAG_FOR_CONTEXT());
        if (status != HERMON_CMD_SUCCESS) {
                HERMON_WARNING(state, "failed to modify port capabilities");
                cmn_err(CE_CONT, "Hermon: SET_IB (port %02d) command failed: "
                    "%08x\n", port, status);
                if (status == HERMON_CMD_INVALID_STATUS) {
                        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
                }
                return (ibc_get_ci_failure(0));
        }

        return (DDI_SUCCESS);
}


/*
 * hermon_set_addr_path()
 *    Context: Can be called from interrupt or base context.
 *
 * Note: This routine is used for two purposes.  It is used to fill in the
 * Hermon UDAV fields, and it is used to fill in the address path information
 * for QPs.  Because the two Hermon structures are similar, common fields can
 * be filled in here.  Because they are different, however, we pass
 * an additional flag to indicate which type is being filled and do each one
 * uniquely
 */

int hermon_srate_override = -1; /* allows ease of testing */

int
hermon_set_addr_path(hermon_state_t *state, ibt_adds_vect_t *av,
    hermon_hw_addr_path_t *path, uint_t type)
{
        uint_t          gidtbl_sz;
        hermon_hw_udav_t *udav;

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))

        udav = (hermon_hw_udav_t *)(void *)path;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
        path->mlid      = av->av_src_path;
        path->rlid      = av->av_dlid;

        switch (av->av_srate) {
        case IBT_SRATE_2:       /* 1xSDR-2.5Gb/s injection rate */
                path->max_stat_rate = 7; break;
        case IBT_SRATE_10:      /* 4xSDR-10.0Gb/s injection rate */
                path->max_stat_rate = 8; break;
        case IBT_SRATE_30:      /* 12xSDR-30Gb/s injection rate */
                path->max_stat_rate = 9; break;
        case IBT_SRATE_5:       /* 1xDDR-5Gb/s injection rate */
                path->max_stat_rate = 10; break;
        case IBT_SRATE_20:      /* 4xDDR-20Gb/s injection rate */
                path->max_stat_rate = 11; break;
        case IBT_SRATE_40:      /* 4xQDR-40Gb/s injection rate */
                path->max_stat_rate = 12; break;
        case IBT_SRATE_60:      /* 12xDDR-60Gb/s injection rate */
                path->max_stat_rate = 13; break;
        case IBT_SRATE_80:      /* 8xQDR-80Gb/s injection rate */
                path->max_stat_rate = 14; break;
        case IBT_SRATE_120:     /* 12xQDR-120Gb/s injection rate */
                path->max_stat_rate = 15; break;
        case IBT_SRATE_NOT_SPECIFIED:   /* Max */
                path->max_stat_rate = 0; break;
        default:
                return (IBT_STATIC_RATE_INVALID);
        }
        if (hermon_srate_override != -1) /* for evaluating HCA firmware */
                path->max_stat_rate = hermon_srate_override;

        /* If "grh" flag is set, then check for valid SGID index too */
        gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
        if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
                return (IBT_SGID_INVALID);
        }

        /*
         * Fill in all "global" values regardless of the value in the GRH
         * flag.  Because "grh" is not set unless "av_send_grh" is set, the
         * hardware will ignore the other "global" values as necessary.  Note:
         * SW does this here to enable later query operations to return
         * exactly the same params that were passed when the addr path was
         * last written.
         */
        path->grh = av->av_send_grh;
        if (type == HERMON_ADDRPATH_QP) {
                path->mgid_index = av->av_sgid_ix;
        } else {
                /*
                 * For Hermon UDAV, the "mgid_index" field is the index into
                 * a combined table (not a per-port table), but having sections
                 * for each port. So some extra calculations are necessary.
                 */

                path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
                    av->av_sgid_ix;

                udav->portnum = av->av_port_num;
        }

        /*
         * According to Hermon PRM, the (31:0) part of rgid_l must be set to
         * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
         * only need to do it for UDAV's.  So we enforce that here.
         *
         * NOTE: The entire 64 bits worth of GUID info is actually being
         * preserved (for UDAVs) by the callers of this function
         * (hermon_ah_alloc() and hermon_ah_modify()) and as long as the
         * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
         * "don't care".
         */
        if ((path->grh) || (type == HERMON_ADDRPATH_QP)) {
                path->flow_label = av->av_flow;
                path->tclass     = av->av_tclass;
                path->hop_limit  = av->av_hop;
                bcopy(&(av->av_dgid.gid_prefix), &(path->rgid_h),
                    sizeof (uint64_t));
                bcopy(&(av->av_dgid.gid_guid), &(path->rgid_l),
                    sizeof (uint64_t));
        } else {
                path->rgid_l     = 0x2;
                path->flow_label = 0;
                path->tclass     = 0;
                path->hop_limit  = 0;
                path->rgid_h     = 0;
        }
        /* extract the default service level */
        udav->sl = (HERMON_DEF_SCHED_SELECTION & 0x3C) >> 2;

        return (DDI_SUCCESS);
}


/*
 * hermon_get_addr_path()
 *    Context: Can be called from interrupt or base context.
 *
 * Note: Just like hermon_set_addr_path() above, this routine is used for two
 * purposes.  It is used to read in the Hermon UDAV fields, and it is used to
 * read in the address path information for QPs.  Because the two Hermon
 * structures are similar, common fields can be read in here.  But because
 * they are slightly different, we pass an additional flag to indicate which
 * type is being read.
 */
void
hermon_get_addr_path(hermon_state_t *state, hermon_hw_addr_path_t *path,
    ibt_adds_vect_t *av, uint_t type)
{
        uint_t          gidtbl_sz;

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))

        av->av_src_path = path->mlid;
        av->av_dlid     = path->rlid;

        /* Set "av_ipd" value from max_stat_rate */
        switch (path->max_stat_rate) {
        case 7:                         /* 1xSDR-2.5Gb/s injection rate */
                av->av_srate = IBT_SRATE_2; break;
        case 8:                         /* 4xSDR-10.0Gb/s injection rate */
                av->av_srate = IBT_SRATE_10; break;
        case 9:                         /* 12xSDR-30Gb/s injection rate */
                av->av_srate = IBT_SRATE_30; break;
        case 10:                        /* 1xDDR-5Gb/s injection rate */
                av->av_srate = IBT_SRATE_5; break;
        case 11:                        /* 4xDDR-20Gb/s injection rate */
                av->av_srate = IBT_SRATE_20; break;
        case 12:                        /* xQDR-40Gb/s injection rate */
                av->av_srate = IBT_SRATE_40; break;
        case 13:                        /* 12xDDR-60Gb/s injection rate */
                av->av_srate = IBT_SRATE_60; break;
        case 14:                        /* 8xQDR-80Gb/s injection rate */
                av->av_srate = IBT_SRATE_80; break;
        case 15:                        /* 12xQDR-120Gb/s injection rate */
                av->av_srate = IBT_SRATE_120; break;
        case 0:                         /* max */
                av->av_srate = IBT_SRATE_NOT_SPECIFIED; break;
        default:                        /* 1x injection rate */
                av->av_srate = IBT_SRATE_1X;
        }

        /*
         * Extract all "global" values regardless of the value in the GRH
         * flag.  Because "av_send_grh" is set only if "grh" is set, software
         * knows to ignore the other "global" values as necessary.  Note: SW
         * does it this way to enable these query operations to return exactly
         * the same params that were passed when the addr path was last written.
         */
        av->av_send_grh         = path->grh;
        if (type == HERMON_ADDRPATH_QP) {
                av->av_sgid_ix  = path->mgid_index;
        } else {
                /*
                 * For Hermon UDAV, the "mgid_index" field is the index into
                 * a combined table (not a per-port table).
                 */
                gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
                av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
                    gidtbl_sz);

                av->av_port_num = ((hermon_hw_udav_t *)(void *)path)->portnum;
        }
        av->av_flow             = path->flow_label;
        av->av_tclass           = path->tclass;
        av->av_hop              = path->hop_limit;
        /* this is for alignment issue w/ the addr path struct in Hermon */
        bcopy(&(path->rgid_h), &(av->av_dgid.gid_prefix), sizeof (uint64_t));
        bcopy(&(path->rgid_l), &(av->av_dgid.gid_guid), sizeof (uint64_t));
}


/*
 * hermon_portnum_is_valid()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_portnum_is_valid(hermon_state_t *state, uint_t portnum)
{
        uint_t  max_port;

        max_port = state->hs_cfg_profile->cp_num_ports;
        if ((portnum <= max_port) && (portnum != 0)) {
                return (1);
        } else {
                return (0);
        }
}


/*
 * hermon_pkeyindex_is_valid()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_pkeyindex_is_valid(hermon_state_t *state, uint_t pkeyindx)
{
        uint_t  max_pkeyindx;

        max_pkeyindx = 1 << state->hs_cfg_profile->cp_log_max_pkeytbl;
        if (pkeyindx < max_pkeyindx) {
                return (1);
        } else {
                return (0);
        }
}


/*
 * hermon_queue_alloc()
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_queue_alloc(hermon_state_t *state, hermon_qalloc_info_t *qa_info,
    uint_t sleepflag)
{
        ddi_dma_attr_t          dma_attr;
        int                     (*callback)(caddr_t);
        uint64_t                realsize, alloc_mask;
        int                     flag, status;

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

        /* Set the callback flag appropriately */
        callback = (sleepflag == HERMON_SLEEP) ? DDI_DMA_SLEEP :
            DDI_DMA_DONTWAIT;

        /*
         * Initialize many of the default DMA attributes.  Then set additional
         * alignment restrictions as necessary for the queue memory.  Also
         * respect the configured value for IOMMU bypass
         */
        hermon_dma_attr_init(state, &dma_attr);
        dma_attr.dma_attr_align = qa_info->qa_bind_align;
#ifdef  __sparc
        if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS) {
                dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
        }
#endif

        /* Allocate a DMA handle */
        status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr, callback, NULL,
            &qa_info->qa_dmahdl);
        if (status != DDI_SUCCESS) {
                return (DDI_FAILURE);
        }

        /*
         * Determine the amount of memory to allocate, depending on the values
         * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
         * to solve here is that allocating a DMA handle with IOMMU bypass
         * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
         * that are less restrictive than the page size.  Since we may need
         * stricter alignments on the memory allocated by ddi_dma_mem_alloc()
         * (e.g. in Hermon QP work queue memory allocation), we use the
         * following method to calculate how much additional memory to request,
         * and we enforce our own alignment on the allocated result.
         */
        alloc_mask = qa_info->qa_alloc_align - 1;
        if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
                realsize = qa_info->qa_size;
        } else {
                realsize = qa_info->qa_size + alloc_mask;
        }

        /*
         * If we are to allocate the queue from system memory, then use
         * ddi_dma_mem_alloc() to find the space.  Otherwise, this is a
         * host memory allocation, use ddi_umem_alloc(). In either case,
         * return a pointer to the memory range allocated (including any
         * necessary alignment adjustments), the "real" memory pointer,
         * the "real" size, and a ddi_acc_handle_t to use when reading
         * from/writing to the memory.
         */
        if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
                /* Allocate system memory for the queue */
                status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
                    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, callback, NULL,
                    (caddr_t *)&qa_info->qa_buf_real,
                    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
                if (status != DDI_SUCCESS) {
                        ddi_dma_free_handle(&qa_info->qa_dmahdl);
                        return (DDI_FAILURE);
                }

                /*
                 * Save temporary copy of the real pointer.  (This may be
                 * modified in the last step below).
                 */
                qa_info->qa_buf_aligned = qa_info->qa_buf_real;

                bzero(qa_info->qa_buf_real, qa_info->qa_buf_realsz);

        } else { /* HERMON_QUEUE_LOCATION_USERLAND */

                /* Allocate userland mappable memory for the queue */
                flag = (sleepflag == HERMON_SLEEP) ? DDI_UMEM_SLEEP :
                    DDI_UMEM_NOSLEEP;
                qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
                    &qa_info->qa_umemcookie);
                if (qa_info->qa_buf_real == NULL) {
                        ddi_dma_free_handle(&qa_info->qa_dmahdl);
                        return (DDI_FAILURE);
                }

                /*
                 * Save temporary copy of the real pointer.  (This may be
                 * modified in the last step below).
                 */
                qa_info->qa_buf_aligned = qa_info->qa_buf_real;

        }

        /*
         * The next to last step is to ensure that the final address
         * ("qa_buf_aligned") has the appropriate "alloc" alignment
         * restriction applied to it (if necessary).
         */
        if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
                qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
                    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
        }
        /*
         * The last step is to figure out the offset of the start relative
         * to the first page of the region - will be used in the eqc/cqc
         * passed to the HW
         */
        qa_info->qa_pgoffs = (uint_t)((uintptr_t)
            qa_info->qa_buf_aligned & HERMON_PAGEOFFSET);

        return (DDI_SUCCESS);
}


/*
 * hermon_queue_free()
 *    Context: Can be called from interrupt or base context.
 */
void
hermon_queue_free(hermon_qalloc_info_t *qa_info)
{
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

        /*
         * Depending on how (i.e. from where) we allocated the memory for
         * this queue, we choose the appropriate method for releasing the
         * resources.
         */
        if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {

                ddi_dma_mem_free(&qa_info->qa_acchdl);

        } else if (qa_info->qa_location == HERMON_QUEUE_LOCATION_USERLAND) {

                ddi_umem_free(qa_info->qa_umemcookie);

        }

        /* Always free the dma handle */
        ddi_dma_free_handle(&qa_info->qa_dmahdl);
}

/*
 * hermon_create_fmr_pool()
 * Create a pool of FMRs.
 *     Context: Can be called from kernel context only.
 */
int
hermon_create_fmr_pool(hermon_state_t *state, hermon_pdhdl_t pd,
    ibt_fmr_pool_attr_t *fmr_attr, hermon_fmrhdl_t *fmrpoolp)
{
        hermon_fmrhdl_t fmrpool;
        hermon_fmr_list_t *fmr, *fmr_next;
        hermon_mrhdl_t   mr;
        int             status;
        int             sleep;
        int             i;

        sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
            HERMON_NOSLEEP;
        if ((sleep == HERMON_SLEEP) &&
            (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
                return (IBT_INVALID_PARAM);
        }

        fmrpool = (hermon_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
        if (fmrpool == NULL) {
                status = IBT_INSUFF_RESOURCE;
                goto fail;
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))

        mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(state->hs_intrmsi_pri));
        mutex_init(&fmrpool->remap_lock, NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(state->hs_intrmsi_pri));
        mutex_init(&fmrpool->dirty_lock, NULL, MUTEX_DRIVER,
            DDI_INTR_PRI(state->hs_intrmsi_pri));

        fmrpool->fmr_state          = state;
        fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
        fmrpool->fmr_flush_arg      = fmr_attr->fmr_func_arg;
        fmrpool->fmr_pool_size      = 0;
        fmrpool->fmr_max_pages      = fmr_attr->fmr_max_pages_per_fmr;
        fmrpool->fmr_page_sz        = fmr_attr->fmr_page_sz;
        fmrpool->fmr_dirty_watermark = fmr_attr->fmr_pool_size / 4;
        fmrpool->fmr_dirty_len      = 0;
        fmrpool->fmr_remap_watermark = fmr_attr->fmr_pool_size / 32;
        fmrpool->fmr_remap_len      = 0;
        fmrpool->fmr_flags          = fmr_attr->fmr_flags;
        fmrpool->fmr_stat_register  = 0;
        fmrpool->fmr_max_remaps     = state->hs_cfg_profile->cp_fmr_max_remaps;
        fmrpool->fmr_remap_gen      = 1;

        fmrpool->fmr_free_list_tail = &fmrpool->fmr_free_list;
        fmrpool->fmr_dirty_list = NULL;
        fmrpool->fmr_dirty_list_tail = &fmrpool->fmr_dirty_list;
        fmrpool->fmr_remap_list = NULL;
        fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
        fmrpool->fmr_pool_size = fmrpool->fmr_free_len =
            fmr_attr->fmr_pool_size;

        for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
                status = hermon_mr_alloc_fmr(state, pd, fmrpool, &mr);
                if (status != DDI_SUCCESS) {
                        goto fail2;
                }

                fmr = (hermon_fmr_list_t *)kmem_zalloc(
                    sizeof (hermon_fmr_list_t), sleep);
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))

                fmr->fmr = mr;
                fmr->fmr_remaps = 0;
                fmr->fmr_remap_gen = fmrpool->fmr_remap_gen;
                fmr->fmr_pool = fmrpool;
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
                mr->mr_fmr = fmr;

                if (!i)         /* address of last entry's link */
                        fmrpool->fmr_free_list_tail = &fmr->fmr_next;
                fmr->fmr_next = fmrpool->fmr_free_list;
                fmrpool->fmr_free_list = fmr;
        }

        /* Set to return pool */
        *fmrpoolp = fmrpool;

        IBTF_DPRINTF_L2("fmr", "create_fmr_pool SUCCESS");
        return (IBT_SUCCESS);
fail2:
        for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
                fmr_next = fmr->fmr_next;
                (void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
                kmem_free(fmr, sizeof (hermon_fmr_list_t));
        }
        kmem_free(fmrpool, sizeof (*fmrpool));
fail:
        *fmrpoolp = NULL;
        IBTF_DPRINTF_L2("fmr", "create_fmr_pool FAILED");
        if (status == DDI_FAILURE) {
                return (ibc_get_ci_failure(0));
        } else {
                return (status);
        }
}

/*
 * hermon_destroy_fmr_pool()
 * Destroy an FMR pool and free all associated resources.
 *     Context: Can be called from kernel context only.
 */
int
hermon_destroy_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
{
        hermon_fmr_list_t       *fmr, *fmr_next;

        mutex_enter(&fmrpool->fmr_lock);
        hermon_fmr_cleanup(fmrpool);

        for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
                fmr_next = fmr->fmr_next;

                (void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
                kmem_free(fmr, sizeof (hermon_fmr_list_t));

                --fmrpool->fmr_pool_size;
        }
        ASSERT(fmrpool->fmr_pool_size == 0);
        mutex_exit(&fmrpool->fmr_lock);

        mutex_destroy(&fmrpool->fmr_lock);
        mutex_destroy(&fmrpool->dirty_lock);
        mutex_destroy(&fmrpool->remap_lock);

        kmem_free(fmrpool, sizeof (*fmrpool));
        IBTF_DPRINTF_L2("fmr", "destroy_fmr_pool SUCCESS");
        return (DDI_SUCCESS);
}

/*
 * hermon_flush_fmr_pool()
 * Ensure that all unmapped FMRs are fully invalidated.
 *     Context: Can be called from kernel context only.
 */
/* ARGSUSED */
int
hermon_flush_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
{
        /*
         * Force the unmapping of all entries on the dirty list, regardless of
         * whether the watermark has been hit yet.
         */
        /* grab the pool lock */
        mutex_enter(&fmrpool->fmr_lock);
        hermon_fmr_cleanup(fmrpool);
        mutex_exit(&fmrpool->fmr_lock);
        return (DDI_SUCCESS);
}

/*
 * hermon_register_physical_fmr()
 * Map memory into FMR
 *    Context: Can be called from interrupt or base context.
 */
int
hermon_register_physical_fmr(hermon_state_t *state, hermon_fmrhdl_t fmrpool,
    ibt_pmr_attr_t *mem_pattr, hermon_mrhdl_t *mr,
    ibt_pmr_desc_t *mem_desc_p)
{
        hermon_fmr_list_t       *fmr;
        int                     status;

        /* Check length */
        if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
            fmrpool->fmr_max_pages)) {
                return (IBT_MR_LEN_INVALID);
        }

        mutex_enter(&fmrpool->fmr_lock);
        if (fmrpool->fmr_free_list == NULL) {
                if (hermon_fmr_verbose & 2)
                        IBTF_DPRINTF_L2("fmr", "register needs remap");
                mutex_enter(&fmrpool->remap_lock);
                if (fmrpool->fmr_remap_list) {
                        /* add to free list */
                        *(fmrpool->fmr_free_list_tail) =
                            fmrpool->fmr_remap_list;
                        fmrpool->fmr_remap_list = NULL;
                        fmrpool->fmr_free_list_tail =
                            fmrpool->fmr_remap_list_tail;

                        /* reset list */
                        fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
                        fmrpool->fmr_free_len += fmrpool->fmr_remap_len;
                        fmrpool->fmr_remap_len = 0;
                }
                mutex_exit(&fmrpool->remap_lock);
        }
        if (fmrpool->fmr_free_list == NULL) {
                if (hermon_fmr_verbose & 2)
                        IBTF_DPRINTF_L2("fmr", "register needs cleanup");
                hermon_fmr_cleanup(fmrpool);
        }

        /* grab next free entry */
        fmr = fmrpool->fmr_free_list;
        if (fmr == NULL) {
                IBTF_DPRINTF_L2("fmr", "WARNING: no free fmr resource");
                cmn_err(CE_CONT, "no free fmr resource\n");
                mutex_exit(&fmrpool->fmr_lock);
                return (IBT_INSUFF_RESOURCE);
        }

        if ((fmrpool->fmr_free_list = fmr->fmr_next) == NULL)
                fmrpool->fmr_free_list_tail = &fmrpool->fmr_free_list;
        fmr->fmr_next = NULL;
        fmrpool->fmr_stat_register++;
        mutex_exit(&fmrpool->fmr_lock);

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
        status = hermon_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
            mem_desc_p);
        if (status != DDI_SUCCESS) {
                return (status);
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr->fmr))
        if (hermon_rdma_debug & 0x4)
                IBTF_DPRINTF_L2("fmr", "  reg: mr %p  key %x",
                    fmr->fmr, fmr->fmr->mr_rkey);
        _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*fmr->fmr))
        if (fmr->fmr_remap_gen != fmrpool->fmr_remap_gen) {
                fmr->fmr_remap_gen = fmrpool->fmr_remap_gen;
                fmr->fmr_remaps = 0;
        }

        fmr->fmr_remaps++;

        *mr = (hermon_mrhdl_t)fmr->fmr;

        return (DDI_SUCCESS);
}

/*
 * hermon_deregister_fmr()
 * Unmap FMR
 *    Context: Can be called from kernel context only.
 */
int
hermon_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
{
        hermon_fmrhdl_t         fmrpool;
        hermon_fmr_list_t       *fmr, **fmrlast;
        int                     len;

        fmr = mr->mr_fmr;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
        fmrpool = fmr->fmr_pool;

        /* mark as owned by software */
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
        *(uint8_t *)(fmr->fmr->mr_mptrsrcp->hr_addr) = 0xF0;

        if (fmr->fmr_remaps <
            state->hs_cfg_profile->cp_fmr_max_remaps) {
                /* add to remap list */
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
                if (hermon_rdma_debug & 0x4)
                        IBTF_DPRINTF_L2("fmr", "dereg: mr %p  key %x",
                            fmr->fmr, fmr->fmr->mr_rkey);
                _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
                mutex_enter(&fmrpool->remap_lock);
                fmr->fmr_next = NULL;
                *(fmrpool->fmr_remap_list_tail) = fmr;
                fmrpool->fmr_remap_list_tail = &fmr->fmr_next;
                fmrpool->fmr_remap_len++;

                /* conditionally add remap list back to free list */
                fmrlast = NULL;
                if (fmrpool->fmr_remap_len >=
                    fmrpool->fmr_remap_watermark) {
                        fmr = fmrpool->fmr_remap_list;
                        fmrlast = fmrpool->fmr_remap_list_tail;
                        len = fmrpool->fmr_remap_len;
                        fmrpool->fmr_remap_len = 0;
                        fmrpool->fmr_remap_list = NULL;
                        fmrpool->fmr_remap_list_tail =
                            &fmrpool->fmr_remap_list;
                }
                mutex_exit(&fmrpool->remap_lock);
                if (fmrlast) {
                        mutex_enter(&fmrpool->fmr_lock);
                        *(fmrpool->fmr_free_list_tail) = fmr;
                        fmrpool->fmr_free_list_tail = fmrlast;
                        fmrpool->fmr_free_len += len;
                        mutex_exit(&fmrpool->fmr_lock);
                }
        } else {
                /* add to dirty list */
                _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
                if (hermon_rdma_debug & 0x4)
                        IBTF_DPRINTF_L2("fmr", "dirty: mr %p  key %x",
                            fmr->fmr, fmr->fmr->mr_rkey);
                _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))

                mutex_enter(&fmrpool->dirty_lock);
                fmr->fmr_next = NULL;
                *(fmrpool->fmr_dirty_list_tail) = fmr;
                fmrpool->fmr_dirty_list_tail = &fmr->fmr_next;
                fmrpool->fmr_dirty_len++;

                if (fmrpool->fmr_dirty_len >=
                    fmrpool->fmr_dirty_watermark) {
                        mutex_exit(&fmrpool->dirty_lock);
                        mutex_enter(&fmrpool->fmr_lock);
                        hermon_fmr_cleanup(fmrpool);
                        mutex_exit(&fmrpool->fmr_lock);
                } else
                        mutex_exit(&fmrpool->dirty_lock);
        }
        return (DDI_SUCCESS);
}

/*
 * hermon_fmr_cleanup()
 *     Context: Called from any context.
 */
static void
hermon_fmr_cleanup(hermon_fmrhdl_t fmrpool)
{
        int                     status;

        ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));

        if (fmrpool->fmr_stat_register == 0)
                return;

        fmrpool->fmr_stat_register = 0;
        membar_producer();

        if (hermon_fmr_verbose)
                IBTF_DPRINTF_L2("fmr", "TPT_SYNC");
        status = hermon_sync_tpt_cmd_post(fmrpool->fmr_state,
            HERMON_CMD_NOSLEEP_SPIN);
        if (status != HERMON_CMD_SUCCESS) {
                cmn_err(CE_WARN, "fmr SYNC_TPT failed(%x)\n", status);
        }
        fmrpool->fmr_remap_gen++;

        /* add everything back to the free list */
        mutex_enter(&fmrpool->dirty_lock);
        if (fmrpool->fmr_dirty_list) {
                /* add to free list */
                *(fmrpool->fmr_free_list_tail) = fmrpool->fmr_dirty_list;
                fmrpool->fmr_dirty_list = NULL;
                fmrpool->fmr_free_list_tail = fmrpool->fmr_dirty_list_tail;

                /* reset list */
                fmrpool->fmr_dirty_list_tail = &fmrpool->fmr_dirty_list;
                fmrpool->fmr_free_len += fmrpool->fmr_dirty_len;
                fmrpool->fmr_dirty_len = 0;
        }
        mutex_exit(&fmrpool->dirty_lock);

        mutex_enter(&fmrpool->remap_lock);
        if (fmrpool->fmr_remap_list) {
                /* add to free list */
                *(fmrpool->fmr_free_list_tail) = fmrpool->fmr_remap_list;
                fmrpool->fmr_remap_list = NULL;
                fmrpool->fmr_free_list_tail = fmrpool->fmr_remap_list_tail;

                /* reset list */
                fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
                fmrpool->fmr_free_len += fmrpool->fmr_remap_len;
                fmrpool->fmr_remap_len = 0;
        }
        mutex_exit(&fmrpool->remap_lock);

        if (fmrpool->fmr_flush_function != NULL) {
                (void) fmrpool->fmr_flush_function(
                    (ibc_fmr_pool_hdl_t)fmrpool,
                    fmrpool->fmr_flush_arg);
        }
}