root/usr/src/uts/common/io/comstar/port/srpt/srpt_ch.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * RDMA channel interface for Solaris SCSI RDMA Protocol Target (SRP)
 * transport port provider module for the COMSTAR framework.
 */

#include <sys/cpuvar.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/sysmacros.h>
#include <sys/sdt.h>
#include <sys/taskq.h>
#include <sys/scsi/scsi.h>
#include <sys/ib/ibtl/ibti.h>

#include <sys/stmf.h>
#include <sys/stmf_ioctl.h>
#include <sys/portif.h>

#include "srp.h"
#include "srpt_impl.h"
#include "srpt_ioc.h"
#include "srpt_stp.h"
#include "srpt_ch.h"

extern srpt_ctxt_t *srpt_ctxt;
extern uint16_t srpt_send_msg_depth;

/*
 * Prototypes.
 */
static void srpt_ch_scq_hdlr(ibt_cq_hdl_t cq_dhl, void *arg);
static void srpt_ch_rcq_hdlr(ibt_cq_hdl_t cq_dhl, void *arg);
static void srpt_ch_process_iu(srpt_channel_t *ch, srpt_iu_t *iu);

/*
 * srpt_ch_alloc()
 */
srpt_channel_t *
srpt_ch_alloc(srpt_target_port_t *tgt, uint8_t port)
{
        ibt_status_t                    status;
        srpt_channel_t                  *ch;
        ibt_cq_attr_t                   cq_attr;
        ibt_rc_chan_alloc_args_t        ch_args;
        uint32_t                        cq_real_size;
        srpt_ioc_t                      *ioc;

        ASSERT(tgt != NULL);
        ioc = tgt->tp_ioc;
        ASSERT(ioc != NULL);

        ch = kmem_zalloc(sizeof (*ch), KM_SLEEP);
        rw_init(&ch->ch_rwlock, NULL, RW_DRIVER, NULL);
        mutex_init(&ch->ch_reflock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ch->ch_cv_complete, NULL, CV_DRIVER, NULL);
        ch->ch_refcnt   = 1;
        ch->ch_cv_waiters = 0;

        ch->ch_state  = SRPT_CHANNEL_CONNECTING;
        ch->ch_tgt    = tgt;
        ch->ch_req_lim_delta = 0;
        ch->ch_ti_iu_len = 0;

        cq_attr.cq_size  = srpt_send_msg_depth * 2;
        cq_attr.cq_sched = 0;
        cq_attr.cq_flags = IBT_CQ_NO_FLAGS;

        status = ibt_alloc_cq(ioc->ioc_ibt_hdl, &cq_attr, &ch->ch_scq_hdl,
            &cq_real_size);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L1("ch_alloc, send CQ alloc error (%d)",
                    status);
                goto scq_alloc_err;
        }

        cq_attr.cq_size  = srpt_send_msg_depth + 1;
        cq_attr.cq_sched = 0;
        cq_attr.cq_flags = IBT_CQ_NO_FLAGS;

        status = ibt_alloc_cq(ioc->ioc_ibt_hdl, &cq_attr, &ch->ch_rcq_hdl,
            &cq_real_size);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_alloc, receive CQ alloc error (%d)",
                    status);
                goto rcq_alloc_err;
        }

        ibt_set_cq_handler(ch->ch_scq_hdl, srpt_ch_scq_hdlr, ch);
        ibt_set_cq_handler(ch->ch_rcq_hdl, srpt_ch_rcq_hdlr, ch);
        (void) ibt_enable_cq_notify(ch->ch_scq_hdl, IBT_NEXT_COMPLETION);
        (void) ibt_enable_cq_notify(ch->ch_rcq_hdl, IBT_NEXT_COMPLETION);

        ch_args.rc_flags   = IBT_WR_SIGNALED;

        /* Maker certain initiator can not read/write our memory */
        ch_args.rc_control = 0;

        ch_args.rc_hca_port_num = port;

        /*
         * Any SRP IU can result in a number of STMF data buffer transfers
         * and those transfers themselves could span multiple initiator
         * buffers.  Therefore, the number of send WQE's actually required
         * can vary.  Here we assume that on average an I/O will require
         * no more than SRPT_MAX_OUT_IO_PER_CMD send WQE's.  In practice
         * this will prevent send work queue overrun, but we will also
         * inform STMF to throttle I/O should the work queue become full.
         *
         * If the HCA tells us the max outstanding WRs for a channel is
         * lower than our default, use the HCA value.
         */
        ch_args.rc_sizes.cs_sq = min(ioc->ioc_attr.hca_max_chan_sz,
            (srpt_send_msg_depth * SRPT_MAX_OUT_IO_PER_CMD));
        ch_args.rc_sizes.cs_rq =  0;
        ch_args.rc_sizes.cs_sq_sgl = 2;
        ch_args.rc_sizes.cs_rq_sgl = 0;

        ch_args.rc_scq = ch->ch_scq_hdl;
        ch_args.rc_rcq = ch->ch_rcq_hdl;
        ch_args.rc_pd  = ioc->ioc_pd_hdl;
        ch_args.rc_clone_chan = NULL;
        ch_args.rc_srq = ioc->ioc_srq_hdl;

        status = ibt_alloc_rc_channel(ioc->ioc_ibt_hdl, IBT_ACHAN_USES_SRQ,
            &ch_args, &ch->ch_chan_hdl, &ch->ch_sizes);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_alloc, IBT channel alloc error (%d)",
                    status);
                goto qp_alloc_err;
        }

        /*
         * Create pool of send WQE entries to map send wqe work IDs
         * to various types (specifically in error cases where OP
         * is not known).
         */
        ch->ch_num_swqe = ch->ch_sizes.cs_sq;
        SRPT_DPRINTF_L2("ch_alloc, number of SWQEs = %u", ch->ch_num_swqe);
        ch->ch_swqe = kmem_zalloc(sizeof (srpt_swqe_t) * ch->ch_num_swqe,
            KM_SLEEP);
        if (ch->ch_swqe == NULL) {
                SRPT_DPRINTF_L2("ch_alloc, SWQE alloc error");
                (void) ibt_free_channel(ch->ch_chan_hdl);
                goto qp_alloc_err;
        }
        mutex_init(&ch->ch_swqe_lock, NULL, MUTEX_DRIVER, NULL);
        ch->ch_head = 1;
        for (ch->ch_tail = 1; ch->ch_tail < ch->ch_num_swqe -1; ch->ch_tail++) {
                ch->ch_swqe[ch->ch_tail].sw_next = ch->ch_tail + 1;
        }
        ch->ch_swqe[ch->ch_tail].sw_next = 0;

        ibt_set_chan_private(ch->ch_chan_hdl, ch);
        return (ch);

qp_alloc_err:
        (void) ibt_free_cq(ch->ch_rcq_hdl);

rcq_alloc_err:
        (void) ibt_free_cq(ch->ch_scq_hdl);

scq_alloc_err:
        cv_destroy(&ch->ch_cv_complete);
        mutex_destroy(&ch->ch_reflock);
        rw_destroy(&ch->ch_rwlock);
        kmem_free(ch, sizeof (*ch));

        return (NULL);
}

/*
 * srpt_ch_add_ref()
 */
void
srpt_ch_add_ref(srpt_channel_t *ch)
{
        mutex_enter(&ch->ch_reflock);
        ch->ch_refcnt++;
        SRPT_DPRINTF_L4("ch_add_ref, ch (%p), refcnt (%d)",
            (void *)ch, ch->ch_refcnt);
        ASSERT(ch->ch_refcnt != 0);
        mutex_exit(&ch->ch_reflock);
}

/*
 * srpt_ch_release_ref()
 *
 * A non-zero value for wait causes thread to block until all references
 * to channel are released.
 */
void
srpt_ch_release_ref(srpt_channel_t *ch, uint_t wait)
{
        mutex_enter(&ch->ch_reflock);

        SRPT_DPRINTF_L4("ch_release_ref, ch (%p), refcnt (%d), wait (%d)",
            (void *)ch, ch->ch_refcnt, wait);

        ASSERT(ch->ch_refcnt != 0);

        ch->ch_refcnt--;

        if (ch->ch_refcnt != 0) {
                if (wait) {
                        ch->ch_cv_waiters++;
                        while (ch->ch_refcnt != 0) {
                                cv_wait(&ch->ch_cv_complete, &ch->ch_reflock);
                        }
                        ch->ch_cv_waiters--;
                } else {
                        mutex_exit(&ch->ch_reflock);
                        return;
                }
        }

        /*
         * Last thread out frees the IB resources, locks/conditions and memory
         */
        if (ch->ch_cv_waiters > 0) {
                /* we're not last, wake someone else up */
                cv_signal(&ch->ch_cv_complete);
                mutex_exit(&ch->ch_reflock);
                return;
        }

        SRPT_DPRINTF_L3("ch_release_ref - release resources");
        if (ch->ch_chan_hdl) {
                SRPT_DPRINTF_L3("ch_release_ref - free channel");
                (void) ibt_free_channel(ch->ch_chan_hdl);
        }

        if (ch->ch_scq_hdl) {
                (void) ibt_free_cq(ch->ch_scq_hdl);
        }

        if (ch->ch_rcq_hdl) {
                (void) ibt_free_cq(ch->ch_rcq_hdl);
        }

        /*
         * There should be no IU's associated with this
         * channel on the SCSI session.
         */
        if (ch->ch_session != NULL) {
                ASSERT(list_is_empty(&ch->ch_session->ss_task_list));

                /*
                 * Currently only have one channel per session, we will
                 * need to release a reference when support is added
                 * for multi-channel target login.
                 */
                srpt_stp_free_session(ch->ch_session);
                ch->ch_session = NULL;
        }

        kmem_free(ch->ch_swqe, sizeof (srpt_swqe_t) * ch->ch_num_swqe);
        mutex_destroy(&ch->ch_swqe_lock);
        mutex_exit(&ch->ch_reflock);
        mutex_destroy(&ch->ch_reflock);
        rw_destroy(&ch->ch_rwlock);
        kmem_free(ch, sizeof (srpt_channel_t));
}

/*
 * srpt_ch_disconnect()
 */
void
srpt_ch_disconnect(srpt_channel_t *ch)
{
        ibt_status_t            status;

        SRPT_DPRINTF_L3("ch_disconnect, invoked for ch (%p)",
            (void *)ch);

        rw_enter(&ch->ch_rwlock, RW_WRITER);

        /*
         * If we are already in the process of disconnecting then
         * nothing need be done, CM will call-back into us when done.
         */
        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                SRPT_DPRINTF_L2("ch_disconnect, called when"
                    " disconnect in progress");
                rw_exit(&ch->ch_rwlock);
                return;
        }
        ch->ch_state = SRPT_CHANNEL_DISCONNECTING;
        rw_exit(&ch->ch_rwlock);

        /*
         * Initiate the sending of the CM DREQ message, the private data
         * should be the SRP Target logout IU.  We don't really care about
         * the remote CM DREP message returned.  We issue this in an
         * asynchronous manner and will cleanup when called back by CM.
         */
        status = ibt_close_rc_channel(ch->ch_chan_hdl, IBT_NONBLOCKING,
            NULL, 0, NULL, NULL, 0);

        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_disconnect, close RC channel"
                    " err(%d)", status);
        }
}

/*
 * srpt_ch_cleanup()
 */
void
srpt_ch_cleanup(srpt_channel_t *ch)
{
        srpt_iu_t               *iu;
        srpt_iu_t               *next;
        ibt_wc_t                wc;
        srpt_target_port_t      *tgt;
        srpt_channel_t          *tgt_ch;
        scsi_task_t             *iutask;

        SRPT_DPRINTF_L3("ch_cleanup, invoked for ch(%p), state(%d)",
            (void *)ch, ch->ch_state);

        /* add a ref for the channel until we're done */
        srpt_ch_add_ref(ch);

        tgt = ch->ch_tgt;
        ASSERT(tgt != NULL);

        /*
         * Make certain the channel is in the target ports list of
         * known channels and remove it (releasing the target
         * ports reference to the channel).
         */
        mutex_enter(&tgt->tp_ch_list_lock);
        tgt_ch = list_head(&tgt->tp_ch_list);
        while (tgt_ch != NULL) {
                if (tgt_ch == ch) {
                        list_remove(&tgt->tp_ch_list, tgt_ch);
                        srpt_ch_release_ref(tgt_ch, 0);
                        break;
                }
                tgt_ch = list_next(&tgt->tp_ch_list, tgt_ch);
        }
        mutex_exit(&tgt->tp_ch_list_lock);

        if (tgt_ch == NULL) {
                SRPT_DPRINTF_L2("ch_cleanup, target channel no"
                    "longer known to target");
                srpt_ch_release_ref(ch, 0);
                return;
        }

        rw_enter(&ch->ch_rwlock, RW_WRITER);
        ch->ch_state = SRPT_CHANNEL_DISCONNECTING;
        rw_exit(&ch->ch_rwlock);

        /*
         * Don't accept any further incoming requests, and clean
         * up the receive queue.  The send queue is left alone
         * so tasks can finish and clean up (whether normally
         * or via abort).
         */
        if (ch->ch_rcq_hdl) {
                ibt_set_cq_handler(ch->ch_rcq_hdl, NULL, NULL);

                while (ibt_poll_cq(ch->ch_rcq_hdl, &wc, 1, NULL) ==
                    IBT_SUCCESS) {
                        iu = (srpt_iu_t *)(uintptr_t)wc.wc_id;
                        SRPT_DPRINTF_L4("ch_cleanup, recovering"
                            " outstanding RX iu(%p)", (void *)iu);
                        mutex_enter(&iu->iu_lock);
                        srpt_ioc_repost_recv_iu(iu->iu_ioc, iu);
                        /*
                         * Channel reference has not yet been added for this
                         * IU, so do not decrement.
                         */
                        mutex_exit(&iu->iu_lock);
                }
        }

        /*
         * Go through the list of outstanding IU for the channel's SCSI
         * session and for each either abort or complete an abort.
         */
        rw_enter(&ch->ch_rwlock, RW_READER);
        if (ch->ch_session != NULL) {
                rw_enter(&ch->ch_session->ss_rwlock, RW_READER);
                iu = list_head(&ch->ch_session->ss_task_list);
                while (iu != NULL) {
                        next = list_next(&ch->ch_session->ss_task_list, iu);

                        mutex_enter(&iu->iu_lock);
                        if (ch == iu->iu_ch) {
                                if (iu->iu_stmf_task == NULL) {
                                        cmn_err(CE_NOTE,
                                            "ch_cleanup, NULL stmf task");
                                        ASSERT(0);
                                }
                                iutask = iu->iu_stmf_task;
                        } else {
                                iutask = NULL;
                        }
                        mutex_exit(&iu->iu_lock);

                        if (iutask != NULL) {
                                SRPT_DPRINTF_L4("ch_cleanup, aborting "
                                    "task(%p)", (void *)iutask);
                                stmf_abort(STMF_QUEUE_TASK_ABORT, iutask,
                                    STMF_ABORTED, NULL);
                        }
                        iu = next;
                }
                rw_exit(&ch->ch_session->ss_rwlock);
        }
        rw_exit(&ch->ch_rwlock);

        srpt_ch_release_ref(ch, 0);
}

/*
 * srpt_ch_rsp_comp()
 *
 * Process a completion for an IB SEND message.  A SEND completion
 * is for a SRP response packet sent back to the initiator.  It
 * will not have a STMF SCSI task associated with it if it was
 * sent for a rejected IU, or was a task management abort response.
 */
static void
srpt_ch_rsp_comp(srpt_channel_t *ch, srpt_iu_t *iu,
        ibt_wc_status_t wc_status)
{
        stmf_status_t   st = STMF_SUCCESS;

        ASSERT(iu->iu_ch == ch);

        /*
         * Process the completion regardless whether it's a failure or
         * success.  At this point, we've processed as far as we can and
         * just need to complete the associated task.
         */

        if (wc_status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_rsp_comp, WC status err(%d)",
                    wc_status);

                st = STMF_FAILURE;

                if (wc_status != IBT_WC_WR_FLUSHED_ERR) {
                        srpt_ch_disconnect(ch);
                }
        }

        /*
         * If the IU response completion is not associated with
         * with a SCSI task, release the IU to return the resource
         * and the reference to the channel it holds.
         */
        mutex_enter(&iu->iu_lock);
        atomic_dec_32(&iu->iu_sq_posted_cnt);

        if (iu->iu_stmf_task == NULL) {
                srpt_ioc_repost_recv_iu(iu->iu_ioc, iu);
                mutex_exit(&iu->iu_lock);
                srpt_ch_release_ref(ch, 0);
                return;
        }

        /*
         * We should not get a SEND completion where the task has already
         * completed aborting and STMF has been informed.
         */
        ASSERT((iu->iu_flags & SRPT_IU_ABORTED) == 0);

        /*
         * Let STMF know we are done.
         */
        mutex_exit(&iu->iu_lock);

        stmf_send_status_done(iu->iu_stmf_task, st, STMF_IOF_LPORT_DONE);
}

/*
 * srpt_ch_data_comp()
 *
 * Process an IB completion for a RDMA operation.  This completion
 * should be associated with the last RDMA operation for any
 * data buffer transfer.
 */
static void
srpt_ch_data_comp(srpt_channel_t *ch, stmf_data_buf_t *stmf_dbuf,
        ibt_wc_status_t wc_status)
{
        srpt_ds_dbuf_t          *dbuf;
        srpt_iu_t               *iu;
        stmf_status_t           status;

        ASSERT(stmf_dbuf != NULL);

        dbuf = (srpt_ds_dbuf_t *)stmf_dbuf->db_port_private;

        ASSERT(dbuf != NULL);

        iu = dbuf->db_iu;

        ASSERT(iu != NULL);
        ASSERT(iu->iu_ch == ch);

        /*
         * If work completion indicates non-flush failure, then
         * start a channel disconnect (asynchronous) and release
         * the reference to the IU.  The task will be cleaned
         * up with STMF during channel shutdown processing.
         */
        if (wc_status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_data_comp, WC status err(%d)",
                    wc_status);
                if (wc_status != IBT_WC_WR_FLUSHED_ERR) {
                        srpt_ch_disconnect(ch);
                }
                atomic_dec_32(&iu->iu_sq_posted_cnt);
                return;
        }

        /*
         * If STMF has requested this task be aborted, then if this is the
         * last I/O operation outstanding, notify STMF the task has been
         *  aborted and ignore the completion.
         */
        mutex_enter(&iu->iu_lock);
        atomic_dec_32(&iu->iu_sq_posted_cnt);

        if ((iu->iu_flags & SRPT_IU_STMF_ABORTING) != 0) {
                scsi_task_t     *abort_task = iu->iu_stmf_task;

                mutex_exit(&iu->iu_lock);
                stmf_abort(STMF_REQUEUE_TASK_ABORT_LPORT, abort_task,
                    STMF_ABORTED, NULL);
                return;
        }

        /*
         * We should not get an RDMA completion where the task has already
         * completed aborting and STMF has been informed.
         */
        ASSERT((iu->iu_flags & SRPT_IU_ABORTED) == 0);

        /*
         * Good completion for last RDMA op associated with a data buffer
         * I/O, if specified initiate status otherwise let STMF know we are
         * done.
         */
        stmf_dbuf->db_xfer_status = STMF_SUCCESS;
        mutex_exit(&iu->iu_lock);

        DTRACE_SRP_8(xfer__done, srpt_channel_t, ch,
            ibt_wr_ds_t, &(dbuf->db_sge), srpt_iu_t, iu,
            ibt_send_wr_t, 0, uint32_t, stmf_dbuf->db_data_size,
            uint32_t, 0, uint32_t, 0,
            uint32_t, (stmf_dbuf->db_flags & DB_DIRECTION_TO_RPORT) ? 1 : 0);

        if ((stmf_dbuf->db_flags & DB_SEND_STATUS_GOOD) != 0) {
                status = srpt_stp_send_status(dbuf->db_iu->iu_stmf_task, 0);
                if (status == STMF_SUCCESS) {
                        return;
                }
                stmf_dbuf->db_xfer_status = STMF_FAILURE;
        }
        stmf_data_xfer_done(dbuf->db_iu->iu_stmf_task, stmf_dbuf, 0);
}

/*
 * srpt_ch_scq_hdlr()
 */
static void
srpt_ch_scq_hdlr(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t            status;
        srpt_channel_t          *ch = arg;
        ibt_wc_t                wc[SRPT_SEND_WC_POLL_SIZE];
        ibt_wc_t                *wcp;
        int                     i;
        uint32_t                cq_rearmed = 0;
        uint32_t                entries;
        srpt_swqe_t             *swqe;

        ASSERT(ch != NULL);

        /* Reference channel for the duration of this call */
        srpt_ch_add_ref(ch);

        for (;;) {
                status = ibt_poll_cq(cq_hdl, &wc[0], SRPT_SEND_WC_POLL_SIZE,
                    &entries);

                if (status != IBT_SUCCESS) {
                        if (status != IBT_CQ_EMPTY) {
                                /*
                                 * This error should not happen. It indicates
                                 * something abnormal has gone wrong and means
                                 * either a hardware or programming logic error.
                                 */
                                SRPT_DPRINTF_L2(
                                    "ch_scq_hdlr, unexpected CQ err(%d)",
                                    status);
                                srpt_ch_disconnect(ch);
                        }

                        /*
                         * If we have not rearmed the CQ do so now and poll to
                         * eliminate race; otherwise we are done.
                         */
                        if (cq_rearmed == 0) {
                                (void) ibt_enable_cq_notify(ch->ch_scq_hdl,
                                    IBT_NEXT_COMPLETION);
                                cq_rearmed = 1;
                                continue;
                        } else {
                                break;
                        }
                }

                for (wcp = wc, i = 0; i < entries; i++, wcp++) {

                        /*
                         * A zero work ID indicates this CQE is associated
                         * with an intermediate post of a RDMA data transfer
                         * operation.  Since intermediate data requests are
                         * unsignaled, we should only get these if there was
                         * an error.  No action is required.
                         */
                        if (wcp->wc_id == 0) {
                                continue;
                        }
                        swqe = ch->ch_swqe + wcp->wc_id;

                        switch (swqe->sw_type) {
                        case SRPT_SWQE_TYPE_RESP:
                                srpt_ch_rsp_comp(ch, (srpt_iu_t *)
                                    swqe->sw_addr, wcp->wc_status);
                                break;

                        case SRPT_SWQE_TYPE_DATA:
                                srpt_ch_data_comp(ch, (stmf_data_buf_t *)
                                    swqe->sw_addr, wcp->wc_status);
                                break;

                        default:
                                SRPT_DPRINTF_L2("ch_scq_hdlr, bad type(%d)",
                                    swqe->sw_type);
                                ASSERT(0);
                        }

                        srpt_ch_free_swqe_wrid(ch, wcp->wc_id);
                }
        }

        srpt_ch_release_ref(ch, 0);
}

/*
 * srpt_ch_rcq_hdlr()
 */
static void
srpt_ch_rcq_hdlr(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t            status;
        srpt_channel_t          *ch = arg;
        ibt_wc_t                wc[SRPT_RECV_WC_POLL_SIZE];
        ibt_wc_t                *wcp;
        int                     i;
        uint32_t                entries;
        srpt_iu_t               *iu;
        uint_t                  cq_rearmed = 0;

        /*
         * The channel object will exists while the CQ handler call-back
         * is installed.
         */
        ASSERT(ch != NULL);
        srpt_ch_add_ref(ch);

        /*
         * If we know a channel disconnect has started do nothing
         * and let channel cleanup code recover resources from the CQ.
         * We are not concerned about races with the state transition
         * since the code will do the correct thing either way. This
         * is simply to circumvent rearming the CQ, and it will
         * catch the state next time.
         */
        rw_enter(&ch->ch_rwlock, RW_READER);
        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                SRPT_DPRINTF_L2("ch_rcq_hdlr, channel disconnecting");
                rw_exit(&ch->ch_rwlock);
                srpt_ch_release_ref(ch, 0);
                return;
        }
        rw_exit(&ch->ch_rwlock);

        for (;;) {
                status = ibt_poll_cq(cq_hdl, &wc[0], SRPT_RECV_WC_POLL_SIZE,
                    &entries);

                if (status != IBT_SUCCESS) {
                        if (status != IBT_CQ_EMPTY) {
                                /*
                                 * This error should not happen. It indicates
                                 * something abnormal has gone wrong and means
                                 * either a hardware or programming logic error.
                                 */
                                SRPT_DPRINTF_L2(
                                    "ch_rcq_hdlr, unexpected CQ err(%d)",
                                    status);
                                srpt_ch_disconnect(ch);
                                break;
                        }

                        /*
                         * If we have not rearmed the CQ do so now and poll to
                         * eliminate race; otherwise we are done.
                         */
                        if (cq_rearmed == 0) {
                                (void) ibt_enable_cq_notify(ch->ch_rcq_hdl,
                                    IBT_NEXT_COMPLETION);
                                cq_rearmed = 1;
                                continue;
                        } else {
                                break;
                        }
                }

                for (wcp = wc, i = 0; i < entries; i++, wcp++) {

                        /*
                         *  Check wc_status before proceeding.  If the
                         *  status indicates a channel problem, stop processing.
                         */
                        if (wcp->wc_status != IBT_WC_SUCCESS) {
                                if (wcp->wc_status == IBT_WC_WR_FLUSHED_ERR) {
                                        SRPT_DPRINTF_L2(
                                            "ch_rcq, unexpected"
                                            " wc_status err(%d)",
                                            wcp->wc_status);
                                        srpt_ch_disconnect(ch);
                                        goto done;
                                } else {
                                        /* skip IUs with errors */
                                        SRPT_DPRINTF_L2(
                                            "ch_rcq, ERROR comp(%d)",
                                            wcp->wc_status);
                                        /* XXX - verify not leaking IUs */
                                        continue;
                                }
                        }

                        iu = (srpt_iu_t *)(uintptr_t)wcp->wc_id;
                        ASSERT(iu != NULL);

                        /*
                         * Process the IU.
                         */
                        ASSERT(wcp->wc_type == IBT_WRC_RECV);
                        srpt_ch_process_iu(ch, iu);
                }
        }

done:
        srpt_ch_release_ref(ch, 0);
}

/*
 * srpt_ch_srp_cmd()
 */
static int
srpt_ch_srp_cmd(srpt_channel_t *ch, srpt_iu_t *iu)
{
        srp_cmd_req_t           *cmd = (srp_cmd_req_t *)iu->iu_buf;
        srp_indirect_desc_t     *i_desc;
        uint_t                  i_di_cnt;
        uint_t                  i_do_cnt;
        uint8_t                 do_fmt;
        uint8_t                 di_fmt;
        uint32_t                *cur_desc_off;
        int                     i;
        ibt_status_t            status;
        uint8_t                 addlen;


        DTRACE_SRP_2(task__command, srpt_channel_t, ch, srp_cmd_req_t, cmd);
        iu->iu_ch  = ch;
        iu->iu_tag = cmd->cr_tag;

        /*
         * The SRP specification and SAM require support for bi-directional
         * data transfer, so we create a single buffer descriptor list that
         * in the IU buffer that covers the data-in and data-out buffers.
         * In practice we will just see unidirectional transfers with either
         * data-in or data out descriptors.  If we were to take that as fact,
         * we could reduce overhead slightly.
         */

        /*
         * additional length is a 6-bit number in 4-byte words, so multiply by 4
         * to get bytes.
         */
        addlen = cmd->cr_add_cdb_len & 0x3f;    /* mask off 6 bits */

        cur_desc_off = (uint32_t *)(void *)&cmd->cr_add_data;
        cur_desc_off  += addlen;                /* 32-bit arithmetic */
        iu->iu_num_rdescs = 0;
        iu->iu_rdescs = (srp_direct_desc_t *)(void *)cur_desc_off;

        /*
         * Examine buffer description for Data In (i.e. data flows
         * to the initiator).
         */
        i_do_cnt = i_di_cnt = 0;
        di_fmt = cmd->cr_buf_fmt >> 4;
        if (di_fmt == SRP_DATA_DESC_DIRECT) {
                iu->iu_num_rdescs = 1;
                cur_desc_off = (uint32_t *)(void *)&iu->iu_rdescs[1];
        } else if (di_fmt == SRP_DATA_DESC_INDIRECT) {
                i_desc = (srp_indirect_desc_t *)iu->iu_rdescs;
                i_di_cnt  = b2h32(i_desc->id_table.dd_len) /
                    sizeof (srp_direct_desc_t);

                /*
                 * Some initiators like OFED occasionally use the wrong counts,
                 * so check total to allow for this.  NOTE: we do not support
                 * reading of the descriptor table from the initiator, so if
                 * not all descriptors are in the IU we drop the task.
                 */
                if (i_di_cnt > (cmd->cr_dicnt + cmd->cr_docnt)) {
                        SRPT_DPRINTF_L2("ch_srp_cmd, remote RDMA of"
                            " descriptors not supported");
                        SRPT_DPRINTF_L2("ch_srp_cmd, sizeof entry (%d),"
                            " i_di_cnt(%d), cr_dicnt(%d)",
                            (uint_t)sizeof (srp_direct_desc_t),
                            i_di_cnt, cmd->cr_dicnt);
                        iu->iu_rdescs = NULL;
                        return (1);
                }
                bcopy(&i_desc->id_desc[0], iu->iu_rdescs,
                    sizeof (srp_direct_desc_t) * i_di_cnt);
                iu->iu_num_rdescs += i_di_cnt;
                cur_desc_off = (uint32_t *)(void *)&i_desc->id_desc[i_di_cnt];
        }

        /*
         * Examine buffer description for Data Out (i.e. data flows
         * from the initiator).
         */
        do_fmt = cmd->cr_buf_fmt & 0x0F;
        if (do_fmt == SRP_DATA_DESC_DIRECT) {
                if (di_fmt == SRP_DATA_DESC_DIRECT) {
                        bcopy(cur_desc_off, &iu->iu_rdescs[iu->iu_num_rdescs],
                            sizeof (srp_direct_desc_t));
                }
                iu->iu_num_rdescs++;
        } else if (do_fmt == SRP_DATA_DESC_INDIRECT) {
                i_desc = (srp_indirect_desc_t *)cur_desc_off;
                i_do_cnt  = b2h32(i_desc->id_table.dd_len) /
                    sizeof (srp_direct_desc_t);

                /*
                 * Some initiators like OFED occasionally use the wrong counts,
                 * so check total to allow for this.  NOTE: we do not support
                 * reading of the descriptor table from the initiator, so if
                 * not all descriptors are in the IU we drop the task.
                 */
                if ((i_di_cnt + i_do_cnt) > (cmd->cr_dicnt + cmd->cr_docnt)) {
                        SRPT_DPRINTF_L2("ch_srp_cmd, remote RDMA of"
                            " descriptors not supported");
                        SRPT_DPRINTF_L2("ch_srp_cmd, sizeof entry (%d),"
                            " i_do_cnt(%d), cr_docnt(%d)",
                            (uint_t)sizeof (srp_direct_desc_t),
                            i_do_cnt, cmd->cr_docnt);
                        iu->iu_rdescs = 0;
                        return (1);
                }
                bcopy(&i_desc->id_desc[0], &iu->iu_rdescs[iu->iu_num_rdescs],
                    sizeof (srp_direct_desc_t) * i_do_cnt);
                iu->iu_num_rdescs += i_do_cnt;
        }

        iu->iu_tot_xfer_len = 0;
        for (i = 0; i < iu->iu_num_rdescs; i++) {
                iu->iu_rdescs[i].dd_vaddr = b2h64(iu->iu_rdescs[i].dd_vaddr);
                iu->iu_rdescs[i].dd_hdl   = b2h32(iu->iu_rdescs[i].dd_hdl);
                iu->iu_rdescs[i].dd_len   = b2h32(iu->iu_rdescs[i].dd_len);
                iu->iu_tot_xfer_len += iu->iu_rdescs[i].dd_len;
        }

#ifdef DEBUG
        if (srpt_errlevel >= SRPT_LOG_L4) {
                SRPT_DPRINTF_L4("ch_srp_cmd, iu->iu_tot_xfer_len (%d)",
                    iu->iu_tot_xfer_len);
                for (i = 0; i < iu->iu_num_rdescs; i++) {
                        SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_vaddr"
                            " (0x%08llx)",
                            i, (u_longlong_t)iu->iu_rdescs[i].dd_vaddr);
                        SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_hdl"
                            " (0x%08x)", i, iu->iu_rdescs[i].dd_hdl);
                        SRPT_DPRINTF_L4("ch_srp_cmd, rdescs[%d].dd_len (%d)",
                            i, iu->iu_rdescs[i].dd_len);
                }
                SRPT_DPRINTF_L4("ch_srp_cmd, LUN (0x%08lx)",
                    (unsigned long int) *((uint64_t *)(void *) cmd->cr_lun));
        }
#endif
        rw_enter(&ch->ch_rwlock, RW_READER);

        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                /*
                 * The channel has begun disconnecting, so ignore the
                 * the command returning the IU resources.
                 */
                rw_exit(&ch->ch_rwlock);
                return (1);
        }

        /*
         * Once a SCSI task is allocated and assigned to the IU, it
         * owns those IU resources, which will be held until STMF
         * is notified the task is done (from a lport perspective).
         */
        iu->iu_stmf_task = stmf_task_alloc(ch->ch_tgt->tp_lport,
            ch->ch_session->ss_ss, cmd->cr_lun,
            SRP_CDB_SIZE + (addlen * 4), 0);
        if (iu->iu_stmf_task == NULL) {
                /*
                 * Could not allocate, return status to the initiator
                 * indicating that we are temporarily unable to process
                 * commands.  If unable to send, immediately return IU
                 * resource.
                 */
                SRPT_DPRINTF_L2("ch_srp_cmd, SCSI task allocation failure");
                rw_exit(&ch->ch_rwlock);
                mutex_enter(&iu->iu_lock);
                status = srpt_stp_send_response(iu, STATUS_BUSY, 0, 0, 0,
                    NULL, SRPT_NO_FENCE_SEND);
                mutex_exit(&iu->iu_lock);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L2("ch_srp_cmd, error(%d) posting error"
                            " response", status);
                        return (1);
                } else {
                        return (0);
                }
        }

        iu->iu_stmf_task->task_port_private = iu;
        iu->iu_stmf_task->task_flags = 0;

        if (di_fmt != 0) {
                iu->iu_stmf_task->task_flags |= TF_WRITE_DATA;
        }
        if (do_fmt != 0) {
                iu->iu_stmf_task->task_flags |= TF_READ_DATA;
        }

        switch (cmd->cr_task_attr) {
        case SRP_TSK_ATTR_QTYPE_SIMPLE:
                iu->iu_stmf_task->task_flags |= TF_ATTR_SIMPLE_QUEUE;
                break;

        case SRP_TSK_ATTR_QTYPE_HEAD_OF_Q:
                iu->iu_stmf_task->task_flags |= TF_ATTR_HEAD_OF_QUEUE;
                break;

        case SRP_TSK_ATTR_QTYPE_ORDERED:
                iu->iu_stmf_task->task_flags |= TF_ATTR_ORDERED_QUEUE;
                break;

        case SRP_TSK_ATTR_QTYPE_ACA_Q_TAG:
                iu->iu_stmf_task->task_flags |= TF_ATTR_ACA;
                break;

        default:
                SRPT_DPRINTF_L2("ch_srp_cmd, reserved task attr (%d)",
                    cmd->cr_task_attr);
                iu->iu_stmf_task->task_flags |= TF_ATTR_ORDERED_QUEUE;
                break;
        }
        iu->iu_stmf_task->task_additional_flags = 0;
        iu->iu_stmf_task->task_priority         = 0;
        iu->iu_stmf_task->task_mgmt_function    = TM_NONE;
        iu->iu_stmf_task->task_max_nbufs        = STMF_BUFS_MAX;
        iu->iu_stmf_task->task_expected_xfer_length = iu->iu_tot_xfer_len;
        iu->iu_stmf_task->task_csn_size         = 0;

        bcopy(cmd->cr_cdb, iu->iu_stmf_task->task_cdb,
            SRP_CDB_SIZE);
        if (addlen != 0) {
                bcopy(&cmd->cr_add_data,
                    iu->iu_stmf_task->task_cdb + SRP_CDB_SIZE,
                    addlen * 4);
        }

        /*
         * Add the IU/task to the session and post to STMF.  The task will
         * remain in the session's list until STMF is informed by SRP that
         * it is done with the task.
         */
        DTRACE_SRP_3(scsi__command, srpt_channel_t, iu->iu_ch,
            scsi_task_t, iu->iu_stmf_task, srp_cmd_req_t, cmd);
        srpt_stp_add_task(ch->ch_session, iu);

        SRPT_DPRINTF_L3("ch_srp_cmd, new task (%p) posted",
            (void *)iu->iu_stmf_task);
        stmf_post_task(iu->iu_stmf_task, NULL);
        rw_exit(&ch->ch_rwlock);

        return (0);
}

/*
 * srpt_ch_task_mgmt_abort()
 *
 * Returns 0 on success, indicating we've sent a management response.
 * Returns !0 to indicate failure; the IU should be reposted.
 */
static ibt_status_t
srpt_ch_task_mgmt_abort(srpt_channel_t *ch, srpt_iu_t *iu,
        uint64_t tag_to_abort)
{
        srpt_session_t  *session = ch->ch_session;
        srpt_iu_t       *ss_iu;
        ibt_status_t    status;

        /*
         * Locate the associated task (tag_to_abort) in the
         * session's active task list.
         */
        rw_enter(&session->ss_rwlock, RW_READER);
        ss_iu = list_head(&session->ss_task_list);
        while (ss_iu != NULL) {
                mutex_enter(&ss_iu->iu_lock);
                if ((tag_to_abort == ss_iu->iu_tag)) {
                        mutex_exit(&ss_iu->iu_lock);
                        break;
                }
                mutex_exit(&ss_iu->iu_lock);
                ss_iu = list_next(&session->ss_task_list, ss_iu);
        }
        rw_exit(&session->ss_rwlock);

        /*
         * Take appropriate action based on state of task
         * to be aborted:
         * 1) No longer exists - do nothing.
         * 2) Previously aborted or status queued - do nothing.
         * 3) Otherwise - initiate abort.
         */
        if (ss_iu == NULL)  {
                goto send_mgmt_resp;
        }

        mutex_enter(&ss_iu->iu_lock);
        if ((ss_iu->iu_flags & (SRPT_IU_STMF_ABORTING |
            SRPT_IU_ABORTED | SRPT_IU_RESP_SENT)) != 0) {
                mutex_exit(&ss_iu->iu_lock);
                goto send_mgmt_resp;
        }

        /*
         * Set aborting flag and notify STMF of abort request.  No
         * additional I/O will be queued for this IU.
         */
        SRPT_DPRINTF_L3("ch_task_mgmt_abort, task found");
        ss_iu->iu_flags |= SRPT_IU_SRP_ABORTING;
        mutex_exit(&ss_iu->iu_lock);
        stmf_abort(STMF_QUEUE_TASK_ABORT,
            ss_iu->iu_stmf_task, STMF_ABORTED, NULL);

send_mgmt_resp:
        mutex_enter(&iu->iu_lock);
        status = srpt_stp_send_mgmt_response(iu, SRP_TM_SUCCESS,
            SRPT_FENCE_SEND);
        mutex_exit(&iu->iu_lock);

        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_task_mgmt_abort, err(%d)"
                    " posting abort response", status);
        }

        return (status);
}

/*
 * srpt_ch_srp_task_mgmt()
 */
static int
srpt_ch_srp_task_mgmt(srpt_channel_t *ch, srpt_iu_t *iu)
{
        srp_tsk_mgmt_t          *tsk = (srp_tsk_mgmt_t *)iu->iu_buf;
        uint8_t                 tm_fn;
        ibt_status_t            status;

        SRPT_DPRINTF_L3("ch_srp_task_mgmt, SRP TASK MGMT func(%d)",
            tsk->tm_function);

        /*
         * Both tag and lun fileds have the same corresponding offsets
         * in both srp_tsk_mgmt_t and srp_cmd_req_t structures.  The
         * casting will allow us to use the same dtrace translator.
         */
        DTRACE_SRP_2(task__command, srpt_channel_t, ch,
            srp_cmd_req_t, (srp_cmd_req_t *)tsk);

        iu->iu_ch  = ch;
        iu->iu_tag = tsk->tm_tag;

        /*
         * Task management aborts are processed directly by the SRP driver;
         * all other task management requests are handed off to STMF.
         */
        switch (tsk->tm_function) {
        case SRP_TSK_MGMT_ABORT_TASK:
                /*
                 * Initiate SCSI transport protocol specific task abort
                 * logic.
                 */
                status = srpt_ch_task_mgmt_abort(ch, iu, tsk->tm_task_tag);
                if (status != IBT_SUCCESS) {
                        /* repost this IU */
                        return (1);
                } else {
                        return (0);
                }

        case SRP_TSK_MGMT_ABORT_TASK_SET:
                tm_fn = TM_ABORT_TASK_SET;
                break;

        case SRP_TSK_MGMT_CLEAR_TASK_SET:
                tm_fn = TM_CLEAR_TASK_SET;
                break;

        case SRP_TSK_MGMT_LUN_RESET:
                tm_fn = TM_LUN_RESET;
                break;

        case SRP_TSK_MGMT_CLEAR_ACA:
                tm_fn = TM_CLEAR_ACA;
                break;

        default:
                /*
                 * SRP does not support the requested task management
                 * function; return a not supported status in the response.
                 */
                SRPT_DPRINTF_L2("ch_srp_task_mgmt, SRP task mgmt fn(%d)"
                    " not supported", tsk->tm_function);
                mutex_enter(&iu->iu_lock);
                status = srpt_stp_send_mgmt_response(iu,
                    SRP_TM_NOT_SUPPORTED, SRPT_NO_FENCE_SEND);
                mutex_exit(&iu->iu_lock);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L2("ch_srp_task_mgmt, err(%d) posting"
                            " response", status);
                        return (1);
                }
                return (0);
        }

        rw_enter(&ch->ch_rwlock, RW_READER);
        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                /*
                 * The channel has begun disconnecting, so ignore the
                 * the command returning the IU resources.
                 */
                rw_exit(&ch->ch_rwlock);
                return (1);
        }

        /*
         * Once a SCSI mgmt task is allocated and assigned to the IU, it
         * owns those IU resources, which will be held until we inform
         * STMF that we are done with the task (from an lports perspective).
         */
        iu->iu_stmf_task = stmf_task_alloc(ch->ch_tgt->tp_lport,
            ch->ch_session->ss_ss, tsk->tm_lun, 0, STMF_TASK_EXT_NONE);
        if (iu->iu_stmf_task == NULL) {
                /*
                 * Could not allocate, return status to the initiator
                 * indicating that we are temporarily unable to process
                 * commands.  If unable to send, immediately return IU
                 * resource.
                 */
                SRPT_DPRINTF_L2("ch_srp_task_mgmt, SCSI task allocation"
                    " failure");
                rw_exit(&ch->ch_rwlock);
                mutex_enter(&iu->iu_lock);
                status = srpt_stp_send_response(iu, STATUS_BUSY, 0, 0, 0,
                    NULL, SRPT_NO_FENCE_SEND);
                mutex_exit(&iu->iu_lock);
                if (status != IBT_SUCCESS) {
                        SRPT_DPRINTF_L2("ch_srp_task_mgmt, err(%d) posting"
                            "busy response", status);
                        /* repost the IU */
                        return (1);
                }
                return (0);
        }

        iu->iu_stmf_task->task_port_private = iu;
        iu->iu_stmf_task->task_flags = 0;
        iu->iu_stmf_task->task_additional_flags =
            TASK_AF_NO_EXPECTED_XFER_LENGTH;
        iu->iu_stmf_task->task_priority = 0;
        iu->iu_stmf_task->task_mgmt_function = tm_fn;
        iu->iu_stmf_task->task_max_nbufs = STMF_BUFS_MAX;
        iu->iu_stmf_task->task_expected_xfer_length = 0;
        iu->iu_stmf_task->task_csn_size = 0;

        /*
         * Add the IU/task to the session and post to STMF.  The task will
         * remain in the session's list until STMF is informed by SRP that
         * it is done with the task.
         */
        srpt_stp_add_task(ch->ch_session, iu);

        SRPT_DPRINTF_L3("ch_srp_task_mgmt, new mgmt task(%p) posted",
            (void *)iu->iu_stmf_task);
        stmf_post_task(iu->iu_stmf_task, NULL);
        rw_exit(&ch->ch_rwlock);

        return (0);
}

/*
 * srpt_ch_process_iu()
 */
static void
srpt_ch_process_iu(srpt_channel_t *ch, srpt_iu_t *iu)
{
        srpt_iu_data_t  *iud;
        int             status = 1;

        /*
         * IU adds reference to channel which will represent a
         * a reference by STMF.  If for whatever reason the IU
         * is not handed off to STMF, then this reference will be
         * released.  Otherwise, the reference will be released when
         * SRP informs STMF that the associated SCSI task is done.
         */
        srpt_ch_add_ref(ch);

        /*
         * Validate login RC channel state. Normally active, if
         * not active then we need to handle a possible race between the
         * receipt of a implied RTU and CM calling back to notify of the
         * state transition.
         */
        rw_enter(&ch->ch_rwlock, RW_READER);
        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                rw_exit(&ch->ch_rwlock);
                goto repost_iu;
        }
        rw_exit(&ch->ch_rwlock);

        iud = iu->iu_buf;

        switch (iud->rx_iu.srp_op) {
        case SRP_IU_CMD:
                status = srpt_ch_srp_cmd(ch, iu);
                break;

        case SRP_IU_TASK_MGMT:
                status = srpt_ch_srp_task_mgmt(ch, iu);
                return;

        case SRP_IU_I_LOGOUT:
                SRPT_DPRINTF_L3("ch_process_iu, SRP INITIATOR LOGOUT");
                /*
                 * Initiators should logout by issuing a CM disconnect
                 * request (DREQ) with the logout IU in the private data;
                 * however some initiators have been known to send the
                 * IU in-band, if this happens just initiate the logout.
                 * Note that we do not return a response as per the
                 * specification.
                 */
                srpt_stp_logout(ch);
                break;

        case SRP_IU_AER_RSP:
        case SRP_IU_CRED_RSP:
        default:
                /*
                 * We don't send asynchronous events or ask for credit
                 * adjustments, so nothing need be done.  Log we got an
                 * unexpected IU but then just repost the IU to the SRQ.
                 */
                SRPT_DPRINTF_L2("ch_process_iu, invalid IU from initiator,"
                    " IU opcode(%d)", iud->rx_iu.srp_op);
                break;
        }

        if (status == 0) {
                return;
        }

repost_iu:
        SRPT_DPRINTF_L4("process_iu:  reposting iu %p", (void *)iu);
        mutex_enter(&iu->iu_lock);
        srpt_ioc_repost_recv_iu(iu->iu_ioc, iu);
        mutex_exit(&iu->iu_lock);
        srpt_ch_release_ref(ch, 0);
}

/*
 * srpt_ch_post_send
 */
ibt_status_t
srpt_ch_post_send(srpt_channel_t *ch, srpt_iu_t *iu, uint32_t len,
        uint_t fence)
{
        ibt_status_t            status;
        ibt_send_wr_t           wr;
        ibt_wr_ds_t             ds;
        uint_t                  posted;

        ASSERT(ch != NULL);
        ASSERT(iu != NULL);
        ASSERT(mutex_owned(&iu->iu_lock));

        rw_enter(&ch->ch_rwlock, RW_READER);
        if (ch->ch_state == SRPT_CHANNEL_DISCONNECTING) {
                rw_exit(&ch->ch_rwlock);
                SRPT_DPRINTF_L2("ch_post_send, bad ch state (%d)",
                    ch->ch_state);
                return (IBT_FAILURE);
        }
        rw_exit(&ch->ch_rwlock);

        wr.wr_id = srpt_ch_alloc_swqe_wrid(ch, SRPT_SWQE_TYPE_RESP,
            (void *)iu);
        if (wr.wr_id == 0) {
                SRPT_DPRINTF_L2("ch_post_send, queue full");
                return (IBT_FAILURE);
        }

        atomic_inc_32(&iu->iu_sq_posted_cnt);

        wr.wr_flags = IBT_WR_SEND_SIGNAL;
        if (fence == SRPT_FENCE_SEND) {
                wr.wr_flags |= IBT_WR_SEND_FENCE;
        }
        wr.wr_opcode = IBT_WRC_SEND;
        wr.wr_trans  = IBT_RC_SRV;
        wr.wr_nds = 1;
        wr.wr_sgl = &ds;

        ds.ds_va = iu->iu_sge.ds_va;
        ds.ds_key = iu->iu_sge.ds_key;
        ds.ds_len = len;

        SRPT_DPRINTF_L4("ch_post_send, posting SRP response to channel"
            " ds.ds_va (0x%16llx), ds.ds_key (0x%08x), "
            " ds.ds_len (%d)",
            (u_longlong_t)ds.ds_va, ds.ds_key, ds.ds_len);

        status = ibt_post_send(ch->ch_chan_hdl, &wr, 1, &posted);
        if (status != IBT_SUCCESS) {
                SRPT_DPRINTF_L2("ch_post_send, post_send failed (%d)",
                    status);
                atomic_dec_32(&iu->iu_sq_posted_cnt);
                srpt_ch_free_swqe_wrid(ch, wr.wr_id);
                return (status);
        }

        return (IBT_SUCCESS);
}

/*
 * srpt_ch_alloc_swqe_wrid()
 */
ibt_wrid_t
srpt_ch_alloc_swqe_wrid(srpt_channel_t *ch,
        srpt_swqe_type_t wqe_type, void *addr)
{
        ibt_wrid_t      wrid;

        mutex_enter(&ch->ch_swqe_lock);
        if (ch->ch_head == ch->ch_tail) {
                mutex_exit(&ch->ch_swqe_lock);
                return ((ibt_wrid_t)0);
        }
        wrid = (ibt_wrid_t)ch->ch_head;
        ch->ch_swqe[ch->ch_head].sw_type = wqe_type;
        ch->ch_swqe[ch->ch_head].sw_addr = addr;
        ch->ch_head = ch->ch_swqe[ch->ch_head].sw_next;
        ch->ch_swqe_posted++;
        mutex_exit(&ch->ch_swqe_lock);
        return (wrid);
}

/*
 * srpt_ch_free_swqe_wrid()
 */
void
srpt_ch_free_swqe_wrid(srpt_channel_t *ch, ibt_wrid_t id)
{
        mutex_enter(&ch->ch_swqe_lock);
        ch->ch_swqe[ch->ch_tail].sw_next = id;
        ch->ch_tail = (uint32_t)id;
        ch->ch_swqe_posted--;
        mutex_exit(&ch->ch_swqe_lock);
}