root/usr/src/uts/common/io/comstar/port/srpt/srpt_ioc.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * I/O Controller functions for the Solaris COMSTAR SCSI RDMA Protocol
 * Target (SRPT) port provider.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/types.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/sdt.h>

#include "srp.h"
#include "srpt_impl.h"
#include "srpt_ioc.h"
#include "srpt_stp.h"
#include "srpt_ch.h"
#include "srpt_common.h"

/*
 * srpt_ioc_srq_size - Tunable parameter that specifies the number
 * of receive WQ entries that can be posted to the IOC shared
 * receive queue.
 *
 * srpt_ioc_init() takes the smaller of this value and the HCA's
 * hca_max_srqs_sz attribute, then subtracts one, to arrive at the
 * SRQ size it actually requests.
 */
uint32_t                srpt_ioc_srq_size = SRPT_DEFAULT_IOC_SRQ_SIZE;
/*
 * Variables defined outside this file.  Within the visible code,
 * srpt_iu_size is used as the per-IU buffer length and
 * srpt_enable_by_default as the fallback "enabled" state for an HCA
 * that has no explicit configuration entry.
 */
extern uint16_t         srpt_send_msg_depth;
extern uint32_t         srpt_iu_size;
extern boolean_t        srpt_enable_by_default;

/*
 * IOC profile capabilities mask must be big-endian
 *
 * Eight single-bit flags packed into one byte.  The field order is
 * mirrored between the _BIT_FIELDS_LTOH and _BIT_FIELDS_HTOL variants so
 * the resulting byte is the same regardless of how the compiler orders
 * bit-fields.
 *
 * NOTE(review): the flag names presumably stand for send/RDMA-read/
 * RDMA-write/atomic "to" and "from" the IOC, per the IB IOC profile
 * operations capability mask -- confirm against the IB specification.
 */
typedef struct srpt_ioc_opcap_bits_s {
#if     defined(_BIT_FIELDS_LTOH)
        uint8_t         af:1,
                        at:1,
                        wf:1,
                        wt:1,
                        rf:1,
                        rt:1,
                        sf:1,
                        st:1;
#elif   defined(_BIT_FIELDS_HTOL)
        uint8_t         st:1,
                        sf:1,
                        rt:1,
                        rf:1,
                        wt:1,
                        wf:1,
                        at:1,
                        af:1;
#else
#error  One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
#endif
} srpt_ioc_opcap_bits_t;

/*
 * Overlay that lets the capability flags be set individually through
 * 'bits' and read back as a single byte through 'mask'.
 */
typedef union {
        srpt_ioc_opcap_bits_t   bits;
        uint8_t                 mask;
} srpt_ioc_opcap_mask_t;

/*
 * vmem arena variables - values derived from iSER
 *
 * SRPT_BUF_MR_CHUNKSIZE and SRPT_BUF_POOL_MAX are handed to
 * srpt_vmem_create() by srpt_ioc_init() as the chunk size and maximum
 * size of the per-IOC data buffer pool.
 */
#define SRPT_MR_QUANTSIZE       0x400                   /* 1K */
#define SRPT_MIN_CHUNKSIZE      0x100000                /* 1MB */

/* use less memory on 32-bit kernels as it's much more constrained */
#ifdef _LP64
#define SRPT_BUF_MR_CHUNKSIZE   0x1000000               /* 16MB */
#define SRPT_BUF_POOL_MAX       0x40000000              /* 1GB */
#else
#define SRPT_BUF_MR_CHUNKSIZE   0x400000                /* 4MB */
#define SRPT_BUF_POOL_MAX       0x4000000               /* 64MB */
#endif

/*
 * Memory region access flags passed to srpt_vmem_create() for the data
 * buffer pool: local write plus remote read and remote write.
 */
static ibt_mr_flags_t   srpt_dbuf_mr_flags =
    IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_WRITE |
    IBT_MR_ENABLE_REMOTE_READ;

/* IBTF asynchronous event handler registered through srpt_ibt_modinfo. */
void srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
        ibt_async_code_t code, ibt_async_event_t *event);

/*
 * Client registration information handed to ibt_attach() by
 * srpt_ioc_attach().
 *
 * NOTE(review): the initializer is positional; the fields are presumably
 * IBTF version, client class, async handler, a reserved pointer and the
 * client name -- confirm against the ibt_clnt_modinfo_s definition.
 */
static struct ibt_clnt_modinfo_s srpt_ibt_modinfo = {
        IBTI_V_CURR,
        IBT_STORAGE_DEV,
        srpt_ioc_ib_async_hdlr,
        NULL,
        "srpt"
};

/*
 * Forward declarations for file-local helpers.
 */

/* I/O Controller construction, teardown and configuration lookup */
static srpt_ioc_t *srpt_ioc_init(ib_guid_t guid);
static void srpt_ioc_fini(srpt_ioc_t *ioc);
static boolean_t srpt_check_hca_cfg_enabled(ib_guid_t hca_guid);

/* Registered-memory pool (srpt_vmem_*) and memory registration helpers */
static srpt_vmem_pool_t *srpt_vmem_create(const char *name, srpt_ioc_t *ioc,
    ib_memlen_t chunksize, uint64_t maxsize, ibt_mr_flags_t flags);
static void *srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size);
static int srpt_vmem_mr_compare(const void *a, const void *b);
static srpt_mr_t *srpt_vmem_chunk_alloc(srpt_vmem_pool_t *ioc,
    ib_memlen_t chunksize);
static void srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool);
static void srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size);
static srpt_mr_t *srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr,
    ib_memlen_t len);
static void srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr);
static void srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr);
static int srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr);

/*
 * srpt_ioc_attach() - I/O Controller attach
 *
 * Attach to IBTF and initialize I/O controllers. The srpt_ctxt->sc_rwlock
 * should be held outside of this call.
 */
int
srpt_ioc_attach()
{
        int             status;
        int             hca_cnt;
        int             hca_ndx;
        ib_guid_t       *guid;

        ASSERT(srpt_ctxt != NULL);

        /*
         * Attach to IBTF and initialize a list of IB devices.  Each
         * HCA will be represented by an I/O Controller.
         */
        status = ibt_attach(&srpt_ibt_modinfo, srpt_ctxt->sc_dip,
            srpt_ctxt,  &srpt_ctxt->sc_ibt_hdl);
        if (status != DDI_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_attach, ibt_attach failed (0x%x)",
                    status);
                return (DDI_FAILURE);
        }

        hca_cnt = ibt_get_hca_list(&guid);
        if (hca_cnt < 1) {
                /*
                 * not a fatal error.  Service will be up and
                 * waiting for ATTACH events.
                 */
                SRPT_DPRINTF_L2("ioc_attach, no HCA found");
                return (DDI_SUCCESS);
        }

        for (hca_ndx = 0; hca_ndx < hca_cnt; hca_ndx++) {
                SRPT_DPRINTF_L2("ioc_attach, attaching HCA %016llx",
                    (u_longlong_t)guid[hca_ndx]);
                srpt_ioc_attach_hca(guid[hca_ndx], B_FALSE);
        }

        ibt_free_hca_list(guid, hca_cnt);
        SRPT_DPRINTF_L3("ioc_attach, added %d I/O Controller(s)",
            srpt_ctxt->sc_num_iocs);
        return (DDI_SUCCESS);
}

/*
 * srpt_ioc_attach_hca() - bring up the I/O Controller for one HCA
 *
 * Creates the I/O Controller for 'hca_guid', allocates its SCSI target
 * port and, if both succeed, appends the controller to
 * srpt_ctxt->sc_ioc_list.  srpt_ctxt->sc_rwlock must be locked by the
 * caller.
 *
 * 'checked' indicates no need to lookup the hca in the HCA configuration
 * list; when it is B_FALSE the configuration is consulted first and a
 * disabled HCA is skipped.
 */
void
srpt_ioc_attach_hca(ib_guid_t hca_guid, boolean_t checked)
{
        srpt_ioc_t      *new_ioc;
        boolean_t       have_port;

        /* honor the configuration unless the caller already did */
        if (!checked && !srpt_check_hca_cfg_enabled(hca_guid)) {
                SRPT_DPRINTF_L2(
                    "ioc_attach_hca, HCA %016llx disabled "
                    "by srpt config",
                    (u_longlong_t)hca_guid);
                return;
        }

        SRPT_DPRINTF_L2("ioc_attach_hca, adding I/O"
            " Controller (%016llx)", (u_longlong_t)hca_guid);

        /*
         * A NULL return means the IOC already exists or an error
         * occurred; srpt_ioc_init() has logged the reason.
         */
        new_ioc = srpt_ioc_init(hca_guid);
        if (new_ioc == NULL) {
                return;
        }

        /* create the COMSTAR SRP target port under the IOC lock */
        rw_enter(&new_ioc->ioc_rwlock, RW_WRITER);
        new_ioc->ioc_tgt_port = srpt_stp_alloc_port(new_ioc,
            new_ioc->ioc_guid);
        have_port = (new_ioc->ioc_tgt_port != NULL) ? B_TRUE : B_FALSE;
        if (!have_port) {
                SRPT_DPRINTF_L1("ioc_attach_hca: alloc SCSI"
                    " Target Port error on GUID(%016llx)",
                    (u_longlong_t)new_ioc->ioc_guid);
        }
        rw_exit(&new_ioc->ioc_rwlock);

        if (!have_port) {
                /* without a target port the IOC is of no use; remove it */
                srpt_ioc_fini(new_ioc);
                return;
        }

        /*
         * The HCA now has its default SCSI Target Port.  The SRP
         * service itself starts once STMF brings that port on-line.
         */
        list_insert_tail(&srpt_ctxt->sc_ioc_list, new_ioc);
        srpt_ctxt->sc_num_iocs++;

        SRPT_DPRINTF_L2("ioc_attach_hca, I/O Controller ibt HCA hdl (%p)",
            (void *)new_ioc->ioc_ibt_hdl);
}

/*
 * srpt_check_hca_cfg_enabled()
 *
 * Looks up the configuration entry for 'hca_guid' and reports whether
 * SRPT services should be activated for that HCA (B_TRUE) or left
 * disabled (B_FALSE).  When there is no configuration at all, no entry
 * for this HCA, or the entry carries no SRPT_PROP_ENABLED value, the
 * answer is srpt_enable_by_default.
 */
static boolean_t
srpt_check_hca_cfg_enabled(ib_guid_t hca_guid)
{
        char            key[32];
        nvlist_t        *hca_cfg;
        boolean_t       result = srpt_enable_by_default;

        SRPT_FORMAT_HCAKEY(key, sizeof (key), (u_longlong_t)hca_guid);

        if (srpt_ctxt->sc_cfg_hca_nv == NULL) {
                return (result);
        }

        if (nvlist_lookup_nvlist(srpt_ctxt->sc_cfg_hca_nv, key,
            &hca_cfg) != 0) {
                SRPT_DPRINTF_L3("check_hca_cfg, did not find guid %s",
                    key);
                return (result);
        }

        SRPT_DPRINTF_L3("check_hca_cfg, found guid %s",  key);

        /* a failed lookup leaves 'result' at the default */
        (void) nvlist_lookup_boolean_value(hca_cfg, SRPT_PROP_ENABLED,
            &result);

        return (result);
}

/*
 * srpt_ioc_update()
 *
 * Using the configuration nvlist, enables or disables SRP services
 * the provided HCAs.  srpt_ctxt->sc_rwlock should be held outside of this call.
 */
void
srpt_ioc_update(void)
{
        boolean_t       enabled;
        nvpair_t        *nvp = NULL;
        uint64_t        hca_guid;
        nvlist_t        *nvl;
        nvlist_t        *cfg = srpt_ctxt->sc_cfg_hca_nv;

        if (cfg == NULL) {
                SRPT_DPRINTF_L2("ioc_update, no configuration data");
                return;
        }

        while ((nvp = nvlist_next_nvpair(cfg, nvp)) != NULL) {
                enabled = srpt_enable_by_default;

                if ((nvpair_value_nvlist(nvp, &nvl)) != 0) {
                        SRPT_DPRINTF_L2("ioc_update, did not find an nvlist");
                        continue;
                }

                if ((nvlist_lookup_uint64(nvl, SRPT_PROP_GUID, &hca_guid))
                    != 0) {
                        SRPT_DPRINTF_L2("ioc_update, did not find a guid");
                        continue;
                }

                (void) nvlist_lookup_boolean_value(nvl, SRPT_PROP_ENABLED,
                    &enabled);

                if (enabled) {
                        SRPT_DPRINTF_L2("ioc_update, enabling guid %016llx",
                            (u_longlong_t)hca_guid);
                        srpt_ioc_attach_hca(hca_guid, B_TRUE);
                } else {
                        SRPT_DPRINTF_L2("ioc_update, disabling guid %016llx",
                            (u_longlong_t)hca_guid);
                        srpt_ioc_detach_hca(hca_guid);
                }
        }
}

/*
 * srpt_ioc_detach() - I/O Controller detach
 *
 * srpt_ctxt->sc_rwlock should be held outside of this call.
 */
void
srpt_ioc_detach()
{
        srpt_ioc_t      *ioc;

        /*
         * All SRP targets must be destroyed before calling this
         * function.
         */
        while ((ioc = list_head(&srpt_ctxt->sc_ioc_list)) != NULL) {
                SRPT_DPRINTF_L2("ioc_detach, removing I/O Controller(%p)"
                    " (%016llx), ibt_hdl(%p)",
                    (void *)ioc,
                    ioc ? (u_longlong_t)ioc->ioc_guid : 0x0ll,
                    (void *)ioc->ioc_ibt_hdl);

                list_remove(&srpt_ctxt->sc_ioc_list, ioc);
                srpt_ioc_fini(ioc);
                srpt_ctxt->sc_num_iocs--;
        }

        srpt_ctxt->sc_ibt_hdl = NULL;
}

/*
 * srpt_ioc_detach_hca()
 *
 * Stop SRP Target services on this HCA: destroy and free its target
 * port, unlink the I/O Controller from srpt_ctxt->sc_ioc_list and
 * release it.  Nothing is done if no I/O Controller exists for the GUID.
 * If the target port cannot be destroyed the I/O Controller is left in
 * place.
 *
 * Unlike srpt_ioc_attach_hca(), the configuration is not consulted here.
 * We arrive either because the IB framework reported the HCA detached or
 * because the administrator explicitly disabled it.
 *
 * Must be called with srpt_ctxt->sc_rwlock locked as RW_WRITER.
 */
void
srpt_ioc_detach_hca(ib_guid_t hca_guid)
{
        srpt_ioc_t              *ioc;
        srpt_target_port_t      *port;
        stmf_status_t           rc;

        ioc = srpt_ioc_get_locked(hca_guid);
        if (ioc == NULL) {
                /* no such I/O Controller; nothing to tear down */
                return;
        }

        rw_enter(&ioc->ioc_rwlock, RW_WRITER);

        port = ioc->ioc_tgt_port;
        rc = (port == NULL) ? STMF_SUCCESS : srpt_stp_destroy_port(port);
        if ((port != NULL) && (rc == STMF_SUCCESS)) {
                ioc->ioc_tgt_port = NULL;
                (void) srpt_stp_free_port(port);
        }

        rw_exit(&ioc->ioc_rwlock);

        if (rc != STMF_SUCCESS) {
                /* should never happen */
                return;
        }

        list_remove(&srpt_ctxt->sc_ioc_list, ioc);
        srpt_ctxt->sc_num_iocs--;

        srpt_ioc_fini(ioc);
        SRPT_DPRINTF_L2("ioc_detach_hca, HCA %016llx detached",
            (u_longlong_t)hca_guid);
}

/*
 * srpt_ioc_init() - I/O Controller initialization
 *
 * Builds the srpt_ioc_t for the HCA identified by 'guid'.  In order, it:
 *   - queries the HCA attributes and rejects a GUID that already has an
 *     I/O Controller,
 *   - opens the HCA and allocates a protection domain,
 *   - allocates the shared receive queue (SRQ support is mandatory),
 *   - allocates and registers the IU buffer pool and posts every IU to
 *     the SRQ,
 *   - creates the data buffer pool and the STMF data buffer store.
 *
 * Returns the new I/O Controller, or NULL if one already exists for the
 * GUID or any step fails.  On failure everything acquired so far is
 * released through the error labels at the end of the function, which
 * unwind in reverse order of acquisition.
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static srpt_ioc_t *
srpt_ioc_init(ib_guid_t guid)
{
        ibt_status_t            status;
        srpt_ioc_t              *ioc;
        ibt_hca_attr_t          hca_attr;
        uint_t                  iu_ndx;
        uint_t                  err_ndx;
        ibt_mr_attr_t           mr_attr;
        ibt_mr_desc_t           mr_desc;
        srpt_iu_t               *iu;
        ibt_srq_sizes_t         srq_attr;
        char                    namebuf[32];
        size_t                  iu_offset;
        uint_t                  srq_sz;

        status = ibt_query_hca_byguid(guid, &hca_attr);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, HCA query error (%d)",
                    status);
                return (NULL);
        }

        ioc = srpt_ioc_get_locked(guid);
        if (ioc != NULL) {
                SRPT_DPRINTF_L2("ioc_init, HCA already exists");
                return (NULL);
        }

        ioc = kmem_zalloc(sizeof (srpt_ioc_t), KM_SLEEP);

        /* the IOC lock is held as writer for the rest of initialization */
        rw_init(&ioc->ioc_rwlock, NULL, RW_DRIVER, NULL);
        rw_enter(&ioc->ioc_rwlock, RW_WRITER);

        bcopy(&hca_attr, &ioc->ioc_attr, sizeof (ibt_hca_attr_t));

        SRPT_DPRINTF_L2("ioc_init, HCA max mr=%d, mrlen=%lld",
            hca_attr.hca_max_memr, (u_longlong_t)hca_attr.hca_max_memr_len);
        ioc->ioc_guid   = guid;

        status = ibt_open_hca(srpt_ctxt->sc_ibt_hdl, guid, &ioc->ioc_ibt_hdl);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, IBT open failed (%d)", status);
                goto hca_open_err;
        }

        status = ibt_alloc_pd(ioc->ioc_ibt_hdl, IBT_PD_NO_FLAGS,
            &ioc->ioc_pd_hdl);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, IBT create PD failed (%d)", status);
                goto pd_alloc_err;
        }

        /*
         * We require hardware support for SRQs.  We use a common SRQ to
         * reduce channel memory consumption.
         */
        if ((ioc->ioc_attr.hca_flags & IBT_HCA_SRQ) == 0) {
                SRPT_DPRINTF_L0(
                    "ioc_init, no SRQ capability, HCA not supported");
                goto srq_alloc_err;
        }

        SRPT_DPRINTF_L3("ioc_init, Using shared receive queues, max srq work"
            " queue size(%d), def size = %d", ioc->ioc_attr.hca_max_srqs_sz,
            srpt_ioc_srq_size);
        /* request one less than the smaller of the tunable and HCA limit */
        srq_sz = srq_attr.srq_wr_sz = min(srpt_ioc_srq_size,
            ioc->ioc_attr.hca_max_srqs_sz) - 1;
        srq_attr.srq_sgl_sz = 1;

        status = ibt_alloc_srq(ioc->ioc_ibt_hdl, IBT_SRQ_NO_FLAGS,
            ioc->ioc_pd_hdl, &srq_attr, &ioc->ioc_srq_hdl,
            &ioc->ioc_srq_attr);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, IBT create SRQ failed(%d)", status);
                goto srq_alloc_err;
        }

        SRPT_DPRINTF_L2("ioc_init, Using SRQ size(%d), MAX SG size(%d)",
            srq_sz, 1);

        ibt_set_srq_private(ioc->ioc_srq_hdl, ioc);

        /*
         * Allocate a pool of SRP IU message buffers and post them to
         * the I/O Controller SRQ.  We let the SRQ manage the free IU
         * messages.
         */
        ioc->ioc_num_iu_entries = srq_sz;

        ioc->ioc_iu_pool = kmem_zalloc(sizeof (srpt_iu_t) *
            ioc->ioc_num_iu_entries, KM_SLEEP);

        ioc->ioc_iu_bufs = kmem_alloc(srpt_iu_size *
            ioc->ioc_num_iu_entries, KM_SLEEP);

        /*
         * KM_SLEEP allocations are documented to wait rather than fail,
         * so this check is purely defensive.
         */
        if ((ioc->ioc_iu_pool == NULL) || (ioc->ioc_iu_bufs == NULL)) {
                SRPT_DPRINTF_L1("ioc_init, failed to allocate SRQ IUs");
                goto srq_iu_alloc_err;
        }

        /* register the whole IU buffer area as a single memory region */
        mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)ioc->ioc_iu_bufs;
        mr_attr.mr_len   = srpt_iu_size * ioc->ioc_num_iu_entries;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;

        status = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
            &mr_attr, &ioc->ioc_iu_mr_hdl, &mr_desc);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, IU buffer pool MR err(%d)",
                    status);
                goto srq_iu_alloc_err;
        }

        /*
         * Carve the buffer area into srpt_iu_size slices, one per IU,
         * and post each IU to the SRQ.
         */
        for (iu_ndx = 0, iu = ioc->ioc_iu_pool; iu_ndx <
            ioc->ioc_num_iu_entries; iu_ndx++, iu++) {

                iu_offset = (iu_ndx * srpt_iu_size);
                iu->iu_buf = (void *)((uintptr_t)ioc->ioc_iu_bufs + iu_offset);

                mutex_init(&iu->iu_lock, NULL, MUTEX_DRIVER, NULL);

                iu->iu_sge.ds_va  = mr_desc.md_vaddr + iu_offset;
                iu->iu_sge.ds_key = mr_desc.md_lkey;
                iu->iu_sge.ds_len = srpt_iu_size;
                iu->iu_ioc        = ioc;
                iu->iu_pool_ndx   = iu_ndx;

                status = srpt_ioc_post_recv_iu(ioc, &ioc->ioc_iu_pool[iu_ndx]);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_init, SRQ IU post err(%d)",
                            status);
                        /*
                         * This IU's lock is already initialized, but
                         * the cleanup loop at srq_iu_post_err only
                         * covers entries [0, iu_ndx).  Destroy it here
                         * so it is not left behind.
                         */
                        mutex_destroy(&iu->iu_lock);
                        goto srq_iu_post_err;
                }
        }

        /*
         * Initialize the dbuf vmem arena
         *
         * NOTE(review): "%16llX" pads the GUID with spaces rather than
         * zeros, unlike the "%016llx" used in this file's log messages.
         * Left as-is because the name is visible outside this function;
         * confirm whether zero padding was intended.
         */
        (void) snprintf(namebuf, sizeof (namebuf),
            "srpt_buf_pool_%16llX", (u_longlong_t)guid);
        ioc->ioc_dbuf_pool = srpt_vmem_create(namebuf, ioc,
            SRPT_BUF_MR_CHUNKSIZE, SRPT_BUF_POOL_MAX, srpt_dbuf_mr_flags);

        if (ioc->ioc_dbuf_pool == NULL) {
                goto stmf_db_alloc_err;
        }

        /*
         * Allocate the I/O Controller STMF data buffer allocator.  The
         * data store will span all targets associated with this IOC.
         */
        ioc->ioc_stmf_ds = stmf_alloc(STMF_STRUCT_DBUF_STORE, 0, 0);
        if (ioc->ioc_stmf_ds == NULL) {
                SRPT_DPRINTF_L1("ioc_attach, STMF DBUF alloc failure for IOC");
                goto stmf_db_alloc_err;
        }
        ioc->ioc_stmf_ds->ds_alloc_data_buf = &srpt_ioc_ds_alloc_dbuf;
        ioc->ioc_stmf_ds->ds_free_data_buf  = &srpt_ioc_ds_free_dbuf;
        ioc->ioc_stmf_ds->ds_port_private   = ioc;

        rw_exit(&ioc->ioc_rwlock);
        return (ioc);

        /*
         * Error unwinding.  Each label releases what was acquired just
         * before the corresponding failure point and falls through to
         * the labels for earlier acquisitions.
         */
stmf_db_alloc_err:
        if (ioc->ioc_dbuf_pool != NULL) {
                srpt_vmem_destroy(ioc->ioc_dbuf_pool);
        }

srq_iu_post_err:
        if (ioc->ioc_iu_mr_hdl != NULL) {
                status = ibt_deregister_mr(ioc->ioc_ibt_hdl,
                    ioc->ioc_iu_mr_hdl);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_init, error deregistering"
                            " memory region (%d)", status);
                }
        }
        /*
         * iu_ndx is the number of IUs fully set up: the index of the
         * failed post, or the whole pool when arriving from a later
         * failure.
         */
        for (err_ndx = 0, iu = ioc->ioc_iu_pool; err_ndx < iu_ndx;
            err_ndx++, iu++) {
                mutex_destroy(&iu->iu_lock);
        }

srq_iu_alloc_err:
        if (ioc->ioc_iu_bufs != NULL) {
                kmem_free(ioc->ioc_iu_bufs, srpt_iu_size *
                    ioc->ioc_num_iu_entries);
        }
        if (ioc->ioc_iu_pool != NULL) {
                kmem_free(ioc->ioc_iu_pool,
                    sizeof (srpt_iu_t) * ioc->ioc_num_iu_entries);
        }
        if (ioc->ioc_srq_hdl != NULL) {
                status = ibt_free_srq(ioc->ioc_srq_hdl);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_init, error freeing SRQ (%d)",
                            status);
                }

        }

srq_alloc_err:
        status = ibt_free_pd(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, free PD error (%d)", status);
        }

pd_alloc_err:
        status = ibt_close_hca(ioc->ioc_ibt_hdl);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_init, close ioc error (%d)", status);
        }

hca_open_err:
        rw_exit(&ioc->ioc_rwlock);
        rw_destroy(&ioc->ioc_rwlock);
        kmem_free(ioc, sizeof (*ioc));
        return (NULL);
}

/*
 * srpt_ioc_fini() - I/O Controller Cleanup
 *
 * Releases everything srpt_ioc_init() attached to 'ioc' (STMF data
 * store, SRQ, IU memory region, IU buffers and pool, data buffer pool,
 * protection domain, HCA handle) and then frees the srpt_ioc_t itself.
 * The target port must already be gone.  Failures of individual IBT
 * calls are logged and cleanup continues.
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static void
srpt_ioc_fini(srpt_ioc_t *ioc)
{
        int             rc;
        srpt_iu_t       *iu;
        srpt_iu_t       *iu_end;

        /*
         * By the time we get here the driver has taken every SRP
         * service on this I/O Controller off-line.
         */
        ASSERT(ioc->ioc_tgt_port == NULL);
        rw_enter(&ioc->ioc_rwlock, RW_WRITER);

        /* without an open HCA handle there are no resources to release */
        if (ioc->ioc_ibt_hdl == NULL) {
                goto release_ioc;
        }

        if (ioc->ioc_stmf_ds != NULL) {
                stmf_free(ioc->ioc_stmf_ds);
        }

        if (ioc->ioc_srq_hdl != NULL) {
                SRPT_DPRINTF_L4("ioc_fini, freeing SRQ");
                rc = ibt_free_srq(ioc->ioc_srq_hdl);
                if (rc != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_fini, free SRQ"
                            " error (%d)", rc);
                }
        }

        if (ioc->ioc_iu_mr_hdl != NULL) {
                rc = ibt_deregister_mr(ioc->ioc_ibt_hdl,
                    ioc->ioc_iu_mr_hdl);
                if (rc != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_fini, error deregistering"
                            " memory region (%d)", rc);
                }
        }

        /* the buffer size depends on ioc_num_iu_entries, reset below */
        if (ioc->ioc_iu_bufs != NULL) {
                kmem_free(ioc->ioc_iu_bufs, srpt_iu_size *
                    ioc->ioc_num_iu_entries);
        }

        if (ioc->ioc_iu_pool != NULL) {
                SRPT_DPRINTF_L4("ioc_fini, freeing IU entries");
                iu_end = ioc->ioc_iu_pool + ioc->ioc_num_iu_entries;
                for (iu = ioc->ioc_iu_pool; iu < iu_end; iu++) {
                        mutex_destroy(&iu->iu_lock);
                }

                SRPT_DPRINTF_L4("ioc_fini, free IU pool struct");
                kmem_free(ioc->ioc_iu_pool,
                    sizeof (srpt_iu_t) * (ioc->ioc_num_iu_entries));
                ioc->ioc_iu_pool = NULL;
                ioc->ioc_num_iu_entries = 0;
        }

        if (ioc->ioc_dbuf_pool != NULL) {
                srpt_vmem_destroy(ioc->ioc_dbuf_pool);
        }

        if (ioc->ioc_pd_hdl != NULL) {
                rc = ibt_free_pd(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl);
                if (rc != IBT_SUCCESS) {
                        SRPT_DPRINTF_L1("ioc_fini, free PD"
                            " error (%d)", rc);
                }
        }

        rc = ibt_close_hca(ioc->ioc_ibt_hdl);
        if (rc != IBT_SUCCESS) {
                SRPT_DPRINTF_L1(
                    "ioc_fini, close ioc error (%d)", rc);
        }

release_ioc:
        rw_exit(&ioc->ioc_rwlock);
        rw_destroy(&ioc->ioc_rwlock);
        kmem_free(ioc, sizeof (srpt_ioc_t));
}

/*
 * srpt_ioc_port_active() - I/O Controller port active
 *
 * Handles a port-active event for the HCA named in 'event'.  If the
 * target's SRP service is already registered, the newly active port is
 * bound to it.  Otherwise, if the target has no active ports and has a
 * pending request to go online, an online request is sent to STMF.
 * Events for an unknown HCA or an HCA without a target port are ignored.
 */
static void
srpt_ioc_port_active(ibt_async_event_t *event)
{
        srpt_ioc_t              *ioc;
        srpt_target_port_t      *port;
        stmf_change_status_t    online_req;
        stmf_status_t           rc;
        boolean_t               want_online = B_FALSE;

        ASSERT(event != NULL);

        SRPT_DPRINTF_L3("ioc_port_active event handler, invoked");

        /*
         * Locate the I/O Controller for this HCA.  Binding only makes
         * sense once the HCA is initialized and has an SRP target.
         */
        ioc = srpt_ioc_get(event->ev_hca_guid);
        if (ioc == NULL) {
                SRPT_DPRINTF_L2("ioc_port_active, I/O Controller not"
                    " active");
                return;
        }

        port = ioc->ioc_tgt_port;
        if (port == NULL) {
                SRPT_DPRINTF_L2("ioc_port_active, no I/O Controller target"
                    " undefined");
                return;
        }

        /*
         * Holding the target lock serializes this with target state
         * transitions driven by STMF.  A NULL service handle means SRP
         * is off-line.
         */
        mutex_enter(&port->tp_lock);

        if (port->tp_ibt_svc_hdl != NULL) {
                ibt_status_t    bind_rc;

                bind_rc = srpt_ioc_svc_bind(port, event->ev_port);
                if (!((bind_rc == IBT_SUCCESS) ||
                    (bind_rc == IBT_HCA_PORT_NOT_ACTIVE))) {
                        SRPT_DPRINTF_L1("ioc_port_active, bind failed (%d)",
                            bind_rc);
                }
        } else if ((port->tp_num_active_ports == 0) &&
            (port->tp_requested_state != port->tp_state) &&
            (port->tp_requested_state == SRPT_TGT_STATE_ONLINE)) {
                /* we were offline for lack of ports; retry the online */
                want_online = B_TRUE;
                online_req.st_completion_status = STMF_SUCCESS;
                online_req.st_additional_info = "port active";
        }

        mutex_exit(&port->tp_lock);

        if (!want_online) {
                return;
        }

        /* the STMF request is issued after dropping the target lock */
        rc = stmf_ctl(STMF_CMD_LPORT_ONLINE, port->tp_lport, &online_req);

        if (rc == STMF_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_port_active, port %d active, "
                    "target %016llx online requested", event->ev_port,
                    (u_longlong_t)ioc->ioc_guid);
        } else if (rc != STMF_ALREADY) {
                SRPT_DPRINTF_L1("ioc_port_active, port %d active, "
                    "target %016llx failed online request: %d",
                    event->ev_port, (u_longlong_t)ioc->ioc_guid,
                    (int)rc);
        }
}

/*
 * srpt_ioc_port_down() - Handle an IBT port-down asynchronous event.
 *
 * Locates the I/O Controller for the HCA named in the event and, under
 * the target port lock, starts a disconnect on every channel whose
 * session is logged in through the affected hardware port.  The
 * target's active port count is then decremented; if that leaves an
 * ONLINE target without any active port, STMF is asked (after the
 * target lock has been dropped) to take the local port offline.
 *
 * NOTE(review): the active port count is decremented unconditionally;
 * confirm a port-down event cannot arrive for a port that was never
 * counted as active.
 */
static void
srpt_ioc_port_down(ibt_async_event_t *event)
{
        srpt_ioc_t              *ioc;
        srpt_target_port_t      *tgt;
        srpt_channel_t          *ch;
        srpt_channel_t          *following;
        boolean_t               go_offline = B_FALSE;
        stmf_change_status_t    cstatus;
        stmf_status_t           ret;

        SRPT_DPRINTF_L3("ioc_port_down event handler, invoked");

        /* Nothing to do unless the HCA is a known I/O Controller. */
        ioc = srpt_ioc_get(event->ev_hca_guid);
        if (ioc == NULL) {
                SRPT_DPRINTF_L2("ioc_port_down, I/O Controller not"
                    " active");
                return;
        }

        /*
         * There is a single SCSI target port per I/O Controller today;
         * were more added, each would need the processing below.
         */
        tgt = ioc->ioc_tgt_port;
        if (tgt == NULL) {
                SRPT_DPRINTF_L2("ioc_port_down, no I/O Controller target"
                    " undefined");
                return;
        }

        mutex_enter(&tgt->tp_lock);

        /*
         * Walk the channel list and disconnect each channel logged in
         * through the failed port.  The successor is fetched before the
         * current channel is handed to srpt_ch_disconnect().
         */
        mutex_enter(&tgt->tp_ch_list_lock);
        for (ch = list_head(&tgt->tp_ch_list); ch != NULL; ch = following) {
                following = list_next(&tgt->tp_ch_list, ch);

                if (!ch->ch_session) {
                        continue;
                }
                if (ch->ch_session->ss_hw_port == event->ev_port) {
                        srpt_ch_disconnect(ch);
                }
        }
        mutex_exit(&tgt->tp_ch_list_lock);

        tgt->tp_num_active_ports--;

        /* With no port left, an online target has to go offline. */
        if ((tgt->tp_state == SRPT_TGT_STATE_ONLINE) &&
            (tgt->tp_num_active_ports == 0)) {
                go_offline = B_TRUE;
                cstatus.st_additional_info = "no ports active";
                cstatus.st_completion_status = STMF_SUCCESS;
        }

        mutex_exit(&tgt->tp_lock);

        if (!go_offline) {
                return;
        }

        /* The STMF request is issued without the target lock held. */
        ret = stmf_ctl(STMF_CMD_LPORT_OFFLINE, tgt->tp_lport, &cstatus);

        if (ret == STMF_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_port_down, port %d down, target "
                    "%016llx offline requested", event->ev_port,
                    (u_longlong_t)ioc->ioc_guid);
                return;
        }

        if (ret != STMF_ALREADY) {
                SRPT_DPRINTF_L1("ioc_port_down, port %d down, target "
                    "%016llx failed offline request: %d",
                    event->ev_port,
                    (u_longlong_t)ioc->ioc_guid, (int)ret);
        }
}

/*
 * srpt_ioc_ib_async_hdlr - I/O Controller IB asynchronous events
 *
 * Dispatches on the asynchronous event code:
 *   IBT_EVENT_PORT_UP      - handed to srpt_ioc_port_active()
 *   IBT_ERROR_PORT_DOWN    - handed to srpt_ioc_port_down()
 *   IBT_HCA_ATTACH_EVENT   - attach the HCA with sc_rwlock held as writer
 *   IBT_HCA_DETACH_EVENT   - detach the HCA with sc_rwlock held as writer
 *   IBT_EVENT_EMPTY_CHAN   - logged only
 * Any other code is logged as unhandled.
 *
 * The clnt and hdl arguments are not used.  Every message emitted here
 * carries the same "ioc_ib_async_hdlr" prefix so that the handler's
 * output can be located with a single search string.
 */
/* ARGSUSED */
void
srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
        ibt_async_code_t code, ibt_async_event_t *event)
{
        srpt_channel_t          *ch;

        switch (code) {
        case IBT_EVENT_PORT_UP:
                srpt_ioc_port_active(event);
                break;

        case IBT_ERROR_PORT_DOWN:
                srpt_ioc_port_down(event);
                break;

        case IBT_HCA_ATTACH_EVENT:
                SRPT_DPRINTF_L2(
                    "ioc_ib_async_hdlr, received attach event for "
                    "HCA 0x%016llx",
                    (u_longlong_t)event->ev_hca_guid);

                /* The IOC list is modified, so take the lock as writer. */
                rw_enter(&srpt_ctxt->sc_rwlock, RW_WRITER);
                srpt_ioc_attach_hca(event->ev_hca_guid, B_FALSE);
                rw_exit(&srpt_ctxt->sc_rwlock);

                break;

        case IBT_HCA_DETACH_EVENT:
                SRPT_DPRINTF_L1(
                    "ioc_ib_async_hdlr, received HCA_DETACH_EVENT for "
                    "HCA 0x%016llx",
                    (u_longlong_t)event->ev_hca_guid);

                rw_enter(&srpt_ctxt->sc_rwlock, RW_WRITER);
                srpt_ioc_detach_hca(event->ev_hca_guid);
                rw_exit(&srpt_ctxt->sc_rwlock);

                break;

        case IBT_EVENT_EMPTY_CHAN:
                /* Channel in ERROR state is now empty */
                ch = (srpt_channel_t *)ibt_get_chan_private(event->ev_chan_hdl);
                SRPT_DPRINTF_L3(
                    "ioc_ib_async_hdlr, received empty channel error on %p",
                    (void *)ch);
                break;

        default:
                SRPT_DPRINTF_L2("ioc_ib_async_hdlr, event not "
                    "handled (%d)", code);
                break;
        }
}

/*
 * srpt_ioc_svc_bind() - Bind the target's SRP service to an HCA port.
 *
 * portnum is 1-based.  The port is queried for its link state and its
 * first source GID.  A port that is already bound under a different GID
 * is unbound first; an unbound port is then bound to the current GID
 * with the target port passed as CM private data.  On success the
 * target's active port count is incremented and the service__up probe
 * fires.
 *
 * Returns:
 *   IBT_INVALID_PARAM        - no service handle, portnum out of range,
 *                              or the port query failed
 *   IBT_HCA_PORT_NOT_ACTIVE  - the port link is not active
 *   status of ibt_bind_service() when the bind fails with anything
 *                              other than IBT_CM_SERVICE_EXISTS
 *   IBT_SUCCESS              - otherwise
 */
ibt_status_t
srpt_ioc_svc_bind(srpt_target_port_t *tgt, uint_t portnum)
{
        srpt_ioc_t              *ioc;
        srpt_hw_port_t          *hwp;
        ibt_hca_portinfo_t      *pinfo;
        uint_t                  pinfo_sz;
        uint_t                  pinfo_cnt;
        ib_gid_t                cur_gid;
        ibt_status_t            status;
        srpt_session_t          sess;

        ASSERT(tgt != NULL);
        ASSERT(tgt->tp_ioc != NULL);
        ioc = tgt->tp_ioc;

        if (tgt->tp_ibt_svc_hdl == NULL) {
                SRPT_DPRINTF_L2("ioc_svc_bind, NULL SCSI target port"
                    " service");
                return (IBT_INVALID_PARAM);
        }

        if (portnum == 0 || portnum > tgt->tp_nports) {
                SRPT_DPRINTF_L2("ioc_svc_bind, bad port (%d)", portnum);
                return (IBT_INVALID_PARAM);
        }

        status = ibt_query_hca_ports(ioc->ioc_ibt_hdl, portnum,
            &pinfo, &pinfo_cnt, &pinfo_sz);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ioc_svc_bind, query port error (%d)",
                    portnum);
                return (IBT_INVALID_PARAM);
        }

        ASSERT(pinfo != NULL);

        /*
         * An inactive port is left alone; the caller is expected to
         * retry the bind once the port has gone active.
         */
        if (pinfo->p_linkstate != IBT_PORT_ACTIVE) {
                SRPT_DPRINTF_L2("ioc_svc_bind, port %d not in active state",
                    portnum);
                ibt_free_portinfo(pinfo, pinfo_sz);
                return (IBT_HCA_PORT_NOT_ACTIVE);
        }

        /* Only the first GID is needed; release the port info early. */
        cur_gid = pinfo->p_sgid_tbl[0];
        ibt_free_portinfo(pinfo, pinfo_sz);
        hwp = &tgt->tp_hw_port[portnum-1];

        /*
         * An existing binding is kept when the GID is unchanged and
         * torn down when the port now reports a different GID.
         */
        if (hwp->hwp_bind_hdl != NULL) {
                if (cur_gid.gid_guid == hwp->hwp_gid.gid_guid &&
                    cur_gid.gid_prefix == hwp->hwp_gid.gid_prefix) {
                        SRPT_DPRINTF_L2("ioc_svc_bind, port %d already bound",
                            portnum);
                } else {
                        SRPT_DPRINTF_L2("ioc_svc_bind, unregister current"
                            " bind");
                        (void) ibt_unbind_service(tgt->tp_ibt_svc_hdl,
                            hwp->hwp_bind_hdl);
                        hwp->hwp_bind_hdl = NULL;
                }
        }

        /* Bind to the current GID if no binding remains. */
        if (hwp->hwp_bind_hdl == NULL) {
                SRPT_DPRINTF_L2("ioc_svc_bind, bind service, %016llx:%016llx",
                    (u_longlong_t)cur_gid.gid_prefix,
                    (u_longlong_t)cur_gid.gid_guid);

                /*
                 * The SCSI Target Port is handed to the CM as private
                 * data; the target will always exist while this service
                 * is bound.
                 */
                status = ibt_bind_service(tgt->tp_ibt_svc_hdl, cur_gid, NULL,
                    tgt, &hwp->hwp_bind_hdl);
                if (status != IBT_SUCCESS && status != IBT_CM_SERVICE_EXISTS) {
                        SRPT_DPRINTF_L1("ioc_svc_bind, bind error (%d)",
                            status);
                        return (status);
                }
                hwp->hwp_gid.gid_guid = cur_gid.gid_guid;
                hwp->hwp_gid.gid_prefix = cur_gid.gid_prefix;
        }

        /* The port now counts as active for this target. */
        tgt->tp_num_active_ports++;

        /* Build a throw-away session purely as the dtrace probe argument. */
        bzero(&sess, sizeof (srpt_session_t));
        ALIAS_STR(sess.ss_t_gid, cur_gid.gid_prefix, cur_gid.gid_guid);
        EUI_STR(sess.ss_t_name, tgt->tp_ibt_svc_id);

        DTRACE_SRP_1(service__up, srpt_session_t, &sess);

        return (IBT_SUCCESS);
}

/*
 * srpt_ioc_svc_unbind() - Remove the SRP service binding from a port.
 *
 * portnum is 1-based.  A NULL target or an out-of-range port is logged
 * and ignored.  The service__down probe fires for every valid port,
 * whether or not a binding exists.  The recorded bind handle and GID are
 * cleared only when ibt_unbind_service() succeeds.
 */
void
srpt_ioc_svc_unbind(srpt_target_port_t *tgt, uint_t portnum)
{
        srpt_hw_port_t          *hwp;
        srpt_session_t          sess;
        ibt_status_t            rc;

        if (tgt == NULL) {
                SRPT_DPRINTF_L2("ioc_svc_unbind, SCSI target does not exist");
                return;
        }

        if (portnum == 0 || portnum > tgt->tp_nports) {
                SRPT_DPRINTF_L2("ioc_svc_unbind, bad port (%d)", portnum);
                return;
        }
        hwp = &tgt->tp_hw_port[portnum-1];

        /* Build a throw-away session purely as the dtrace probe argument. */
        bzero(&sess, sizeof (srpt_session_t));
        ALIAS_STR(sess.ss_t_gid, hwp->hwp_gid.gid_prefix,
            hwp->hwp_gid.gid_guid);
        EUI_STR(sess.ss_t_name, tgt->tp_ibt_svc_id);

        DTRACE_SRP_1(service__down, srpt_session_t, &sess);

        /* Without a service or a binding there is nothing to undo. */
        if (tgt->tp_ibt_svc_hdl == NULL || hwp->hwp_bind_hdl == NULL) {
                return;
        }

        SRPT_DPRINTF_L2("ioc_svc_unbind, unregister current bind");
        rc = ibt_unbind_service(tgt->tp_ibt_svc_hdl, hwp->hwp_bind_hdl);
        if (rc == IBT_SUCCESS) {
                hwp->hwp_bind_hdl = NULL;
                hwp->hwp_gid.gid_prefix = 0;
                hwp->hwp_gid.gid_guid = 0;
                return;
        }

        SRPT_DPRINTF_L1(
            "ioc_svc_unbind, unregister port %d failed: %d",
            portnum, rc);
}

/*
 * srpt_ioc_svc_unbind_all() - Unbind the SRP service from every port.
 *
 * Calls srpt_ioc_svc_unbind() for ports 1 through tp_nports of the
 * given target.  A NULL target is logged and ignored.
 */
void
srpt_ioc_svc_unbind_all(srpt_target_port_t *tgt)
{
        uint_t          port;

        if (tgt == NULL) {
                SRPT_DPRINTF_L2("ioc_svc_unbind_all, NULL SCSI target port"
                    " specified");
                return;
        }

        /* Port numbers are 1-based. */
        port = 1;
        while (port <= tgt->tp_nports) {
                srpt_ioc_svc_unbind(tgt, port);
                port++;
        }
}

/*
 * srpt_ioc_get_locked() - Find an I/O Controller by GUID.
 *
 * Walks srpt_ctxt->sc_ioc_list and returns the first entry whose
 * ioc_guid equals guid, or NULL if there is none.
 *
 * The caller must hold srpt_ctxt->sc_rwlock across the call.
 */
srpt_ioc_t *
srpt_ioc_get_locked(ib_guid_t guid)
{
        srpt_ioc_t      *cur;

        for (cur = list_head(&srpt_ctxt->sc_ioc_list); cur != NULL;
            cur = list_next(&srpt_ctxt->sc_ioc_list, cur)) {
                if (cur->ioc_guid == guid) {
                        return (cur);
                }
        }

        return (NULL);
}

/*
 * srpt_ioc_get() - Find an I/O Controller by GUID.
 *
 * Wrapper around srpt_ioc_get_locked() that takes srpt_ctxt->sc_rwlock
 * as a reader for the duration of the lookup.  The lock is released
 * before the result (possibly NULL) is returned.
 */
srpt_ioc_t *
srpt_ioc_get(ib_guid_t guid)
{
        srpt_ioc_t      *match;

        rw_enter(&srpt_ctxt->sc_rwlock, RW_READER);
        match = srpt_ioc_get_locked(guid);
        rw_exit(&srpt_ctxt->sc_rwlock);

        return (match);
}

/*
 * srpt_ioc_post_recv_iu() - Post one IU to the IOC shared receive queue.
 *
 * Builds a single-segment receive work request that uses the IU's own
 * scatter/gather entry and carries the IU pointer as the work request
 * id, then posts it to ioc_srq_hdl.  A failure is logged; the status
 * from ibt_post_srq() is returned either way.
 */
ibt_status_t
srpt_ioc_post_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
        ibt_recv_wr_t           recv_wr;
        uint_t                  nposted = 0;
        ibt_status_t            rc;

        ASSERT(ioc != NULL);
        ASSERT(iu != NULL);

        recv_wr.wr_sgl = &iu->iu_sge;
        recv_wr.wr_nds = 1;
        recv_wr.wr_id  = (ibt_wrid_t)(uintptr_t)iu;

        rc = ibt_post_srq(ioc->ioc_srq_hdl, &recv_wr, 1, &nposted);
        if (rc != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ioc_post_recv_iu, post error (%d)",
                    rc);
        }

        return (rc);
}

/*
 * srpt_ioc_repost_recv_iu() - Recycle an IU onto the receive queue.
 *
 * Resets the per-request state of the IU and posts it again via
 * srpt_ioc_post_recv_iu().  When the post succeeds and the IU was
 * associated with a channel, that channel's ch_req_lim_delta is
 * atomically incremented.
 *
 * The caller must hold iu->iu_lock.
 */
void
srpt_ioc_repost_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
        srpt_channel_t          *owner;
        ibt_status_t            rc;

        ASSERT(iu != NULL);
        ASSERT(mutex_owned(&iu->iu_lock));

        /*
         * Debug-only sanity checks: every STMF task activity must have
         * finished before the IU goes back to the available pool.
         */
        ASSERT(iu->iu_stmf_task == NULL);
        ASSERT(iu->iu_sq_posted_cnt == 0);

        /* Remember the channel, then scrub the IU for reuse. */
        owner = iu->iu_ch;

        iu->iu_sq_posted_cnt = 0;
        iu->iu_flags = 0;
        iu->iu_tag = 0;
        iu->iu_tot_xfer_len = 0;
        iu->iu_rdescs = NULL;
        iu->iu_num_rdescs = 0;
        iu->iu_ch = NULL;

        rc = srpt_ioc_post_recv_iu(ioc, iu);

        if (rc == IBT_SUCCESS) {
                if (owner != NULL) {
                        atomic_inc_32(&owner->ch_req_lim_delta);
                }
                return;
        }

        /*
         * Very bad.  A shutdown of the I/O Controller should be started
         * here, off-lining every target associated with it (and thereby
         * disconnecting any remaining logins).
         *
         * In practice this should never happen, so that work sits near
         * the bottom of the implementation list.
         */
        SRPT_DPRINTF_L0("ioc_repost_recv_iu, error RX IU (%d)",
            rc);
        ASSERT(0);
}

/*
 * srpt_ioc_init_profile() - Fill in the I/O Controller profile.
 *
 * Populates ioc->ioc_profile from the HCA attributes, the SRP protocol
 * constants and the driver tunables, converting multi-byte fields with
 * the h2b*() macros.  The single service entry in ioc->ioc_svc is
 * advertised only when SRP is enabled on the target port.
 *
 * SRP I/O Controller serialization lock must be held when this
 * routine is invoked.
 */
void
srpt_ioc_init_profile(srpt_ioc_t *ioc)
{
        srpt_ioc_opcap_mask_t           capmask = {0};
        uint32_t                        vendor;

        ASSERT(ioc != NULL);

        ioc->ioc_profile.ioc_guid = h2b64(ioc->ioc_guid);

        /*
         * Exactly the 23 characters of the string are copied, without a
         * terminating NUL.
         */
        (void) memcpy(ioc->ioc_profile.ioc_id_string,
            "Solaris SRP Target 0.9a", 23);

        /*
         * Vendor ID and subsystem vendor ID are 24 bit values stored in
         * the upper bits of their fields.  The low order 8 bits of the
         * vendor ID field is the slot, and the low order 8 bits of the
         * subsystem field is reserved; both are left zero.  The same
         * shifted HCA vendor ID is used for both fields.
         */
        vendor = h2b32((uint32_t)(ioc->ioc_attr.hca_vendor_id << 8));

        ioc->ioc_profile.ioc_vendorid = vendor;
        ioc->ioc_profile.ioc_deviceid =
            h2b32((uint32_t)ioc->ioc_attr.hca_device_id);
        ioc->ioc_profile.ioc_device_ver =
            h2b16((uint16_t)ioc->ioc_attr.hca_version_id);
        ioc->ioc_profile.ioc_subsys_vendorid = vendor;
        ioc->ioc_profile.ioc_subsys_id = h2b32(0);

        /* Protocol identification. */
        ioc->ioc_profile.ioc_io_class = h2b16(SRP_REV_16A_IO_CLASS);
        ioc->ioc_profile.ioc_io_subclass = h2b16(SRP_IO_SUBCLASS);
        ioc->ioc_profile.ioc_protocol = h2b16(SRP_PROTOCOL);
        ioc->ioc_profile.ioc_protocol_ver = h2b16(SRP_PROTOCOL_VERSION);

        /* Queue depths and transfer sizes. */
        ioc->ioc_profile.ioc_send_msg_qdepth = h2b16(srpt_send_msg_depth);
        ioc->ioc_profile.ioc_rdma_read_qdepth =
            ioc->ioc_attr.hca_max_rdma_out_chan;
        ioc->ioc_profile.ioc_send_msg_sz = h2b32(srpt_iu_size);
        ioc->ioc_profile.ioc_rdma_xfer_sz = h2b32(SRPT_DEFAULT_MAX_RDMA_SIZE);

        /* Controller operations capability mask. */
        capmask.bits.wf = 1;    /* RDMA Writes can be sent from IOC */
        capmask.bits.rf = 1;    /* RDMA Reads can be sent from IOC */
        capmask.bits.sf = 1;    /* Messages can be sent from IOC */
        capmask.bits.st = 1;    /* Messages can be sent to IOC */
        ioc->ioc_profile.ioc_ctrl_opcap_mask = capmask.mask;

        /*
         * Only one target exists today.  With a list of targets, this
         * would walk the list and count only the ONLINE ones when
         * setting the number of services and their entries.
         */
        if (!ioc->ioc_tgt_port->tp_srp_enabled) {
                ioc->ioc_profile.ioc_service_entries = 0;
                ioc->ioc_svc.srv_id = 0;
                return;
        }

        ioc->ioc_profile.ioc_service_entries = 1;
        ioc->ioc_svc.srv_id = h2b64(ioc->ioc_guid);
        (void) snprintf((char *)ioc->ioc_svc.srv_name,
            IB_DM_MAX_SVC_NAME_LEN, "SRP.T10:%016llx",
            (u_longlong_t)ioc->ioc_guid);
}

/*
 * srpt_ioc_ds_alloc_dbuf() - Allocate an STMF data buffer for a task.
 *
 * Carves size bytes out of the data buffer pool of the I/O Controller
 * that owns the task's IU, looks up the memory region covering that
 * range, and wraps both in a freshly allocated stmf_data_buf_t with a
 * single scatter/gather segment.  The pminsize and flags arguments are
 * not consulted (flags is only logged).
 *
 * Returns the new buffer, or NULL if the pool allocation, the memory
 * region lookup or stmf_alloc() fails; pool memory obtained before a
 * later failure is returned to the pool.
 */
/* ARGSUSED */
stmf_data_buf_t *
srpt_ioc_ds_alloc_dbuf(struct scsi_task *task, uint32_t size,
        uint32_t *pminsize, uint32_t flags)
{
        srpt_iu_t               *iu;
        srpt_ioc_t              *ioc;
        srpt_ds_dbuf_t          *priv;
        stmf_data_buf_t         *sbuf;
        void                    *mem;
        srpt_mr_t               region;

        ASSERT(task != NULL);
        iu  = task->task_port_private;
        ioc = iu->iu_ioc;

        SRPT_DPRINTF_L4("ioc_ds_alloc_dbuf, invoked ioc(%p)"
            " size(%d), flags(%x)",
            (void *)ioc, size, flags);

        mem = srpt_vmem_alloc(ioc->ioc_dbuf_pool, size);
        if (mem == NULL) {
                return (NULL);
        }

        if (srpt_vmem_mr(ioc->ioc_dbuf_pool, mem, size, &region) == 0) {
                sbuf = stmf_alloc(STMF_STRUCT_DATA_BUF,
                    sizeof (srpt_ds_dbuf_t), 0);

                if (sbuf != NULL) {
                        /* Port-private part: MR and SGE for RDMA. */
                        priv = sbuf->db_port_private;
                        priv->db_stmf_buf = sbuf;
                        priv->db_ioc = ioc;
                        priv->db_mr_hdl = region.mr_hdl;
                        priv->db_sge.ds_va = region.mr_va;
                        priv->db_sge.ds_key = region.mr_lkey;
                        priv->db_sge.ds_len = size;

                        /* STMF-visible part: one segment of size bytes. */
                        sbuf->db_sglist_length = 1;
                        sbuf->db_sglist[0].seg_addr = mem;
                        sbuf->db_sglist[0].seg_length = size;
                        sbuf->db_buf_size = size;
                        sbuf->db_data_size = size;
                        sbuf->db_relative_offset = 0;
                        sbuf->db_flags = 0;
                        sbuf->db_xfer_status = 0;

                        return (sbuf);
                }

                SRPT_DPRINTF_L2("ioc_ds_alloc_dbuf, stmf_alloc failed");
        }

        /* Either failure above: hand the pool memory back. */
        srpt_vmem_free(ioc->ioc_dbuf_pool, mem, size);

        return (NULL);
}

/*
 * srpt_ioc_ds_free_dbuf() - Release an STMF data buffer.
 *
 * Returns the buffer's first (and only expected) segment to the data
 * buffer pool of the I/O Controller stored in ds->ds_port_private, then
 * frees the stmf_data_buf_t itself.
 */
void
srpt_ioc_ds_free_dbuf(struct stmf_dbuf_store *ds,
        stmf_data_buf_t *dbuf)
{
        srpt_ioc_t      *owner;

        SRPT_DPRINTF_L4("ioc_ds_free_dbuf, invoked buf (%p)",
            (void *)dbuf);

        owner = ds->ds_port_private;
        srpt_vmem_free(owner->ioc_dbuf_pool, dbuf->db_sglist[0].seg_addr,
            dbuf->db_buf_size);

        stmf_free(dbuf);
}

/* Memory arena routines */

/*
 * srpt_vmem_create() - Create a pool of registered memory.
 *
 * Allocates the pool descriptor, registers an initial chunk of up to
 * chunksize bytes with the HCA, and creates a vmem arena spanning that
 * chunk.  The pool may later grow, one chunk at a time, up to maxsize
 * (see srpt_vmem_alloc()).
 *
 *   name      - name given to the vmem arena
 *   ioc       - I/O Controller whose HCA/PD the memory is registered with
 *   chunksize - preferred size of each registered chunk
 *   maxsize   - upper bound on the total registered size
 *   flags     - memory region flags used for every chunk
 *
 * Returns the new pool, or NULL if the initial chunk could not be
 * allocated or registered; srpt_vmem_chunk_alloc() uses non-sleeping
 * allocations and so can fail.
 */
static srpt_vmem_pool_t *
srpt_vmem_create(const char *name, srpt_ioc_t *ioc, ib_memlen_t chunksize,
    uint64_t maxsize, ibt_mr_flags_t flags)
{
        srpt_mr_t               *chunk;
        srpt_vmem_pool_t        *result;

        ASSERT(chunksize <= maxsize);

        result = kmem_zalloc(sizeof (srpt_vmem_pool_t), KM_SLEEP);

        result->svp_ioc = ioc;
        result->svp_chunksize = chunksize;
        result->svp_max_size = maxsize;
        result->svp_flags = flags;

        rw_init(&result->svp_lock, NULL, RW_DRIVER, NULL);
        avl_create(&result->svp_mr_list, srpt_vmem_mr_compare,
            sizeof (srpt_mr_t), offsetof(srpt_mr_t, mr_avl));

        chunk = srpt_vmem_chunk_alloc(result, chunksize);
        if (chunk == NULL) {
                /*
                 * No initial chunk: undo the partial construction rather
                 * than dereference a NULL chunk below.
                 */
                SRPT_DPRINTF_L1("srpt_vmem_create, initial chunk"
                    " allocation failed");
                avl_destroy(&result->svp_mr_list);
                rw_destroy(&result->svp_lock);
                kmem_free(result, sizeof (srpt_vmem_pool_t));
                return (NULL);
        }

        avl_add(&result->svp_mr_list, chunk);

        /*
         * The chunk obtained may be smaller than the size requested, so
         * account for the length recorded in the chunk itself.
         */
        result->svp_total_size = chunk->mr_len;

        result->svp_vmem = vmem_create(name,
            (void*)(uintptr_t)chunk->mr_va,
            (size_t)chunk->mr_len, SRPT_MR_QUANTSIZE,
            NULL, NULL, NULL, 0, VM_SLEEP);

        return (result);
}

/*
 * srpt_vmem_destroy() - Tear down a registered memory pool.
 *
 * Destroys the vmem arena, then removes, deregisters and frees every
 * chunk in the pool's AVL tree in ascending address order, and finally
 * frees the pool descriptor itself.
 */
static void
srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool)
{
        srpt_mr_t               *mr;

        rw_enter(&vm_pool->svp_lock, RW_WRITER);
        vmem_destroy(vm_pool->svp_vmem);

        /* Keep taking the lowest chunk until the tree is empty. */
        while ((mr = avl_first(&vm_pool->svp_mr_list)) != NULL) {
                avl_remove(&vm_pool->svp_mr_list, mr);
                srpt_vmem_chunk_free(vm_pool, mr);
        }

        avl_destroy(&vm_pool->svp_mr_list);

        rw_exit(&vm_pool->svp_lock);
        rw_destroy(&vm_pool->svp_lock);

        kmem_free(vm_pool, sizeof (srpt_vmem_pool_t));
}

/*
 * srpt_vmem_alloc() - Allocate registered memory from a pool.
 *
 * Tries the pool's vmem arena first.  If that fails and the pool is
 * still below its maximum size, one more chunk (clamped so the total
 * never exceeds svp_max_size) is registered and added to the arena
 * under the pool's writer lock, and the allocation is retried once.
 *
 * Never sleeps.  Returns NULL when the request cannot be satisfied.
 */
static void *
srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size)
{
        void            *mem;
        srpt_mr_t       *grown;
        ib_memlen_t     grow_by;

        ASSERT(vm_pool != NULL);

        /* Fast path: the arena already has room. */
        mem = vmem_alloc(vm_pool->svp_vmem, size,
            VM_NOSLEEP | VM_FIRSTFIT);
        if (mem != NULL) {
                return (mem);
        }

        /* Slow path: try to grow the arena by one chunk. */
        rw_enter(&vm_pool->svp_lock, RW_WRITER);

        if (vm_pool->svp_total_size >= vm_pool->svp_max_size) {
                /* Already at the ceiling, nothing more to add. */
                rw_exit(&vm_pool->svp_lock);
                return (NULL);
        }

        grow_by = vm_pool->svp_chunksize;
        if ((vm_pool->svp_total_size + grow_by) > vm_pool->svp_max_size) {
                grow_by = vm_pool->svp_max_size - vm_pool->svp_total_size;
        }

        grown = srpt_vmem_chunk_alloc(vm_pool, grow_by);
        if (grown == NULL) {
                /* No chunk available; the retry below decides the result. */
                ;
        } else if (vmem_add(vm_pool->svp_vmem,
            (void*)(uintptr_t)grown->mr_va,
            grown->mr_len, VM_NOSLEEP) != NULL) {
                /*
                 * The chunk may be smaller than requested, so the
                 * accounting uses the length stored in the chunk.
                 */
                avl_add(&vm_pool->svp_mr_list, grown);
                vm_pool->svp_total_size += grown->mr_len;
        } else {
                srpt_vmem_chunk_free(vm_pool, grown);
                SRPT_DPRINTF_L2("vmem_add failed");
        }

        rw_exit(&vm_pool->svp_lock);

        /* Retry once, whether or not the arena actually grew. */
        return (vmem_alloc(vm_pool->svp_vmem, size,
            VM_NOSLEEP | VM_FIRSTFIT));
}

/*
 * srpt_vmem_free() - Return memory to a pool's vmem arena.
 *
 * vaddr and size must describe a range previously obtained from
 * srpt_vmem_alloc() on the same pool.  The underlying registered
 * chunks are not released here.
 */
static void
srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size)
{
        vmem_free(vm_pool->svp_vmem, vaddr, size);
}

/*
 * srpt_vmem_mr() - Look up the memory region covering an address range.
 *
 * Searches the pool's AVL tree (keyed by chunk start address) for the
 * chunk starting at vaddr or, failing an exact match, the closest chunk
 * that starts before it.  If that chunk covers [vaddr, vaddr + size),
 * *mr is filled with the chunk's handle and keys together with the
 * requested address and length.
 *
 * Returns DDI_SUCCESS when *mr was filled in, DDI_FAILURE otherwise.
 */
static int
srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr)
{
        ib_vaddr_t              want = (ib_vaddr_t)(uintptr_t)vaddr;
        srpt_mr_t               key;
        srpt_mr_t               *owner;
        avl_index_t             slot;
        ib_vaddr_t              owner_end;
        int                     rv = DDI_FAILURE;

        /* Only the start address takes part in the comparison. */
        key.mr_va = want;

        rw_enter(&vm_pool->svp_lock, RW_READER);

        owner = avl_find(&vm_pool->svp_mr_list, &key, &slot);
        if (owner == NULL) {
                /* No chunk starts here; fall back to the one before. */
                owner = avl_nearest(&vm_pool->svp_mr_list, slot,
                    AVL_BEFORE);
        }

        if (owner != NULL) {
                ASSERT(owner->mr_va <= want);

                /* The whole range must lie inside the chunk. */
                owner_end = owner->mr_va + owner->mr_len;
                if (owner_end >= want + size) {
                        mr->mr_hdl = owner->mr_hdl;
                        mr->mr_lkey = owner->mr_lkey;
                        mr->mr_rkey = owner->mr_rkey;
                        mr->mr_va = want;
                        mr->mr_len = size;
                        rv = DDI_SUCCESS;
                }
        }

        rw_exit(&vm_pool->svp_lock);

        return (rv);
}

/*
 * srpt_vmem_chunk_alloc() - Allocate and register one pool chunk.
 *
 * Attempts a non-sleeping kernel allocation of chunksize bytes, halving
 * the size after each failure until it would drop below
 * SRPT_MIN_CHUNKSIZE.  The memory obtained is then registered through
 * srpt_reg_mem(); if registration fails the memory is freed again.
 *
 * Returns the chunk descriptor, whose length may be smaller than the
 * size requested, or NULL on failure.
 */
static srpt_mr_t *
srpt_vmem_chunk_alloc(srpt_vmem_pool_t *vm_pool, ib_memlen_t chunksize)
{
        void                    *mem;
        srpt_mr_t               *desc;

        for (;;) {
                if (chunksize < SRPT_MIN_CHUNKSIZE) {
                        /* Gave up (or was asked for too little). */
                        return (NULL);
                }

                mem = kmem_alloc(chunksize, KM_NOSLEEP);
                if (mem != NULL) {
                        break;
                }

                SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
                    "failed to alloc chunk of %d, trying %d",
                    (int)chunksize, (int)chunksize/2);
                chunksize /= 2;
        }

        desc = srpt_reg_mem(vm_pool, (ib_vaddr_t)(uintptr_t)mem,
            chunksize);
        if (desc == NULL) {
                SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
                    "chunk registration failed");
                kmem_free(mem, chunksize);
        }

        return (desc);
}

/*
 * srpt_vmem_chunk_free() - Deregister and free one pool chunk.
 *
 * Deregisters the chunk's memory region via srpt_dereg_mem() and then
 * frees the chunk's backing memory.  The caller is responsible for
 * having removed the chunk from any list or tree beforehand.
 */
static void
srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr)
{
        /*
         * Capture the address and length first: srpt_dereg_mem() frees
         * the descriptor, so mr must not be touched after that call.
         */
        void                    *chunk = (void *)(uintptr_t)mr->mr_va;
        ib_memlen_t             chunksize = mr->mr_len;

        srpt_dereg_mem(vm_pool->svp_ioc, mr);
        kmem_free(chunk, chunksize);
}

/*
 * srpt_reg_mem() - Register a memory range with the pool's HCA.
 *
 * Allocates a descriptor (without sleeping) and registers the range
 * [vaddr, vaddr + len) with the I/O Controller's HCA and protection
 * domain, using the pool's memory region flags.
 *
 * Returns the descriptor holding the region handle, address, length and
 * local/remote keys, or NULL if either the descriptor allocation or the
 * registration fails.
 */
static srpt_mr_t *
srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr, ib_memlen_t len)
{
        srpt_ioc_t              *ioc = vm_pool->svp_ioc;
        srpt_mr_t               *desc;
        ibt_mr_attr_t           attr;
        ibt_mr_desc_t           keys;
        ibt_status_t            rc;

        desc = kmem_zalloc(sizeof (srpt_mr_t), KM_NOSLEEP);
        if (desc == NULL) {
                SRPT_DPRINTF_L2("srpt_reg_mem: failed to allocate");
                return (NULL);
        }

        bzero(&keys, sizeof (ibt_mr_desc_t));
        bzero(&attr, sizeof (ibt_mr_attr_t));

        attr.mr_flags = vm_pool->svp_flags;
        attr.mr_as = NULL;
        attr.mr_len = len;
        attr.mr_vaddr = vaddr;

        rc = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
            &attr, &desc->mr_hdl, &keys);
        if (rc == IBT_SUCCESS) {
                desc->mr_va = attr.mr_vaddr;
                desc->mr_len = attr.mr_len;
                desc->mr_lkey = keys.md_lkey;
                desc->mr_rkey = keys.md_rkey;
                return (desc);
        }

        SRPT_DPRINTF_L2("srpt_reg_mem: ibt_register_mr "
            "failed %d", rc);
        kmem_free(desc, sizeof (srpt_mr_t));

        return (NULL);
}

/*
 * srpt_dereg_mem() - Deregister a memory region and free its descriptor.
 *
 * A deregistration failure is logged only; the descriptor is freed in
 * every case, so the caller must not use mr after this returns.
 */
static void
srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr)
{
        ibt_status_t            rc;

        rc = ibt_deregister_mr(ioc->ioc_ibt_hdl, mr->mr_hdl);
        if (rc != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("srpt_dereg_mem, error deregistering MR (%d)",
                    rc);
        }

        kmem_free(mr, sizeof (srpt_mr_t));
}

/*
 * srpt_vmem_mr_compare() - AVL comparator for pool chunks.
 *
 * Orders two srpt_mr_t descriptors by their starting virtual address
 * only.  Returns -1, 0 or 1 when a starts below, at, or above b.
 */
static int
srpt_vmem_mr_compare(const void *a, const void *b)
{
        const srpt_mr_t         *lhs = a;
        const srpt_mr_t         *rhs = b;

        if (lhs->mr_va == rhs->mr_va) {
                return (0);
        }

        return ((lhs->mr_va < rhs->mr_va) ? -1 : 1);
}