root/sys/dev/iser/iser_verbs.c
/*-
 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "icl_iser.h"

/* malloc(9) type used for every allocation made in this file. */
static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
/*
 * Completion budget for one run of iser_cq_tasklet_fn(): polling stops once
 * at least this many work completions have been handled.
 */
static int iser_cq_poll_limit = 512;

static void
iser_cq_event_callback(struct ib_event *cause, void *context)
{
        ISER_ERR("got cq event %d", cause->event);
}

static void
iser_qp_event_callback(struct ib_event *cause, void *context)
{
        ISER_ERR("got qp event %d", cause->event);
}

/*
 * Device-wide asynchronous event handler (registered through
 * ib_register_event_handler()); logs the event code, device name and port.
 */
static void
iser_event_handler(struct ib_event_handler *handler,
                                struct ib_event *event)
{
        const char *dev_name = event->device->name;

        ISER_ERR("async event %d on device %s port %d",
                 event->event, dev_name, event->element.port_num);
}

/**
 * is_iser_tx_desc - Indicate if the completion wr_id
 *     is a TX descriptor or not.
 * @iser_conn: iser connection
 * @wr_id: completion WR identifier
 *
 * Since we cannot rely on wc opcode in FLUSH errors
 * we must work around it by checking if the wr_id address
 * falls in the iser connection rx_descs buffer. If so
 * it is an RX descriptor, otherwize it is a TX.
 */
static inline bool
is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
{
        void *start = iser_conn->rx_descs;
        u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
        void *end = (void *)((uintptr_t)start + (uintptr_t)len);

        if (start) {
                if (wr_id >= start && wr_id < end)
                        return false;
        } else {
                return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
        }

        return true;
}

/**
 * iser_handle_comp_error() - Handle error completion
 * @ib_conn:   connection RDMA resources
 * @wc:        work completion
 *
 * Notes: Update post_recv_buf_count in case of recv error completion.
 *        For non-FLUSH error completion we should also notify iscsi layer that
 *        connection is failed (in case we passed bind stage).
 */
static void
iser_handle_comp_error(struct ib_conn *ib_conn,
                       struct ib_wc *wc)
{
        void *wr_id = (void *)(uintptr_t)wc->wr_id;
        struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
                                                   ib_conn);

        if (is_iser_tx_desc(iser_conn, wr_id)) {
                ISER_DBG("conn %p got send comp error", iser_conn);
        } else {
                ISER_DBG("conn %p got recv comp error", iser_conn);
                ib_conn->post_recv_buf_count--;
        }
        if (wc->status != IB_WC_WR_FLUSH_ERR)
                iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
}

/**
 * iser_handle_wc - handle a single work completion
 * @wc: work completion
 *
 * A successful completion is dispatched by opcode to the receive or send
 * completion routine. A failed completion is logged; if it carries
 * ISER_BEACON_WRID the waiter on the beacon condition variable is woken,
 * otherwise it is passed to iser_handle_comp_error().
 */
static void iser_handle_wc(struct ib_wc *wc)
{
        struct ib_conn *ib_conn = wc->qp->qp_context;
        struct iser_conn *iser_conn;

        if (likely(wc->status == IB_WC_SUCCESS)) {
                switch (wc->opcode) {
                case IB_WC_RECV:
                        iser_rcv_completion(
                            (struct iser_rx_desc *)(uintptr_t)wc->wr_id,
                            wc->byte_len, ib_conn);
                        break;
                case IB_WC_SEND:
                        iser_snd_completion(
                            (struct iser_tx_desc *)(uintptr_t)wc->wr_id,
                            ib_conn);
                        break;
                default:
                        ISER_ERR("Unknown wc opcode %d", wc->opcode);
                        break;
                }
                return;
        }

        iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);

        if (wc->status == IB_WC_WR_FLUSH_ERR) {
                ISER_DBG("flush error: conn %p wr id %llx",
                         iser_conn, (unsigned long long)wc->wr_id);
        } else {
                ISER_ERR("conn %p wr id %llx status %d vend_err %x",
                         iser_conn, (unsigned long long)wc->wr_id,
                         wc->status, wc->vendor_err);
        }

        if (wc->wr_id != ISER_BEACON_WRID) {
                iser_handle_comp_error(ib_conn, wc);
                return;
        }

        /* The beacon came back: all flush errors were consumed. */
        mtx_lock(&ib_conn->beacon.flush_lock);
        ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
        cv_signal(&ib_conn->beacon.flush_cv);
        mtx_unlock(&ib_conn->beacon.flush_lock);
}

/*
 * Taskqueue handler for one completion context: polls the CQ in batches of
 * ARRAY_SIZE(comp->wcs), hands every completion to iser_handle_wc(), and
 * stops when the CQ returns nothing or iser_cq_poll_limit completions were
 * handled. The CQ is re-armed before returning.
 */
static void
iser_cq_tasklet_fn(void *data, int pending)
{
        struct iser_comp *comp = (struct iser_comp *)data;
        struct ib_cq *cq = comp->cq;
        struct ib_wc *const wcs = comp->wcs;
        int handled = 0;

        for (;;) {
                int polled = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs);
                int idx;

                if (polled <= 0)
                        break;

                for (idx = 0; idx < polled; idx++)
                        iser_handle_wc(wcs + idx);

                handled += polled;
                if (handled >= iser_cq_poll_limit)
                        break;
        }

        /*
         * The original author's assumption: arming the CQ only at this
         * point does not cause completion interrupts to be missed.
         */
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
}

/*
 * CQ completion callback: defers the actual polling to the completion
 * context's taskqueue (see iser_cq_tasklet_fn()).
 */
static void
iser_cq_callback(struct ib_cq *cq, void *cq_context)
{
        struct iser_comp *owner = (struct iser_comp *)cq_context;

        taskqueue_enqueue(owner->tq, &owner->task);
}

/**
 * iser_create_device_ib_res - creates the Protection Domain (PD) and one
 * Completion Queue (CQ) plus polling taskqueue per completion context, picks
 * up the PD's DMA Memory Region (DMA MR) and registers the asynchronous
 * event handler for the device associated with the adaptor.
 *
 * The device must advertise IB_DEVICE_MEM_MGT_EXTENSIONS (fast registration).
 * On any failure everything created so far is released again, including the
 * taskqueues of completion contexts that had already been set up.
 *
 * returns 0 on success, 1 on failure
 */
static int
iser_create_device_ib_res(struct iser_device *device)
{
        struct ib_device *ib_dev = device->ib_device;
        int i, max_cqe;

        if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
                ISER_ERR("device %s doesn't support Fastreg, "
                         "can't register memory", device->ib_device->name);
                return (1);
        }

        device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);

        /* M_ZERO matters: cleanup relies on unused cq/tq slots being NULL. */
        device->comps = malloc(device->comps_used * sizeof(*device->comps),
                M_ISER_VERBS, M_WAITOK | M_ZERO);

        max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);

        ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
                 device->comps_used, device->ib_device->name,
                 device->ib_device->num_comp_vectors, max_cqe);

        device->pd = ib_alloc_pd(device->ib_device, IB_PD_UNSAFE_GLOBAL_RKEY);
        if (IS_ERR(device->pd))
                goto pd_err;

        for (i = 0; i < device->comps_used; i++) {
                struct iser_comp *comp = &device->comps[i];
                struct ib_cq_init_attr cq_attr = {
                        .cqe            = max_cqe,
                        .comp_vector    = i,
                };

                comp->device = device;
                comp->cq = ib_create_cq(device->ib_device,
                                        iser_cq_callback,
                                        iser_cq_event_callback,
                                        (void *)comp,
                                        &cq_attr);
                if (IS_ERR(comp->cq)) {
                        /* Keep the error pointer away from the cleanup loop. */
                        comp->cq = NULL;
                        goto comp_err;
                }

                if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
                        goto comp_err;

                TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
                comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
                                taskqueue_thread_enqueue, &comp->tq);
                if (!comp->tq)
                        goto comp_err;
                taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
        }

        device->mr = device->pd->__internal_mr;
        if (IS_ERR(device->mr))
                goto comp_err;

        INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
                                iser_event_handler);
        if (ib_register_event_handler(&device->event_handler))
                goto comp_err;

        return (0);

comp_err:
        /*
         * A failure in iteration i > 0 leaves fully initialized contexts
         * behind, so every failure after PD allocation must release the
         * taskqueues as well as the CQs. Slots never reached are NULL.
         */
        for (i = 0; i < device->comps_used; i++) {
                struct iser_comp *comp = &device->comps[i];
                if (comp->tq)
                        taskqueue_free(comp->tq);
        }
        for (i = 0; i < device->comps_used; i++) {
                struct iser_comp *comp = &device->comps[i];
                if (comp->cq)
                        ib_destroy_cq(comp->cq);
        }
        ib_dealloc_pd(device->pd);
pd_err:
        free(device->comps, M_ISER_VERBS);
        ISER_ERR("failed to allocate an IB resource");
        return (1);
}

/**
 * iser_free_device_ib_res - tears down what iser_create_device_ib_res()
 * set up: per-context taskqueues and CQs, the event handler registration,
 * the PD and the completion context array.
 */
static void
iser_free_device_ib_res(struct iser_device *device)
{
        int idx = 0;

        while (idx < device->comps_used) {
                struct iser_comp *cur = device->comps + idx;

                taskqueue_free(cur->tq);
                ib_destroy_cq(cur->cq);
                cur->cq = NULL;
                idx++;
        }

        (void)ib_unregister_event_handler(&device->event_handler);
        (void)ib_dealloc_pd(device->pd);
        device->pd = NULL;
        device->mr = NULL;

        free(device->comps, M_ISER_VERBS);
        device->comps = NULL;
}

/*
 * Allocate the fast-registration MR for one registration resource.
 * The MR is sized for ISCSI_ISER_SG_TABLESIZE + 1 entries.
 *
 * Returns 0 on success (res->mr_valid is set), otherwise the negated
 * PTR_ERR() value of the failed allocation.
 */
static int
iser_alloc_reg_res(struct ib_device *ib_device,
                   struct ib_pd *pd,
                   struct iser_reg_resources *res)
{
        struct ib_mr *new_mr;
        int err;

        new_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
            ISCSI_ISER_SG_TABLESIZE + 1);
        res->mr = new_mr;

        if (!IS_ERR(new_mr)) {
                res->mr_valid = 1;
                return (0);
        }

        err = -PTR_ERR(new_mr);
        ISER_ERR("Failed to allocate  fast reg mr err=%d", err);
        return (err);
}

/* Release the MR held by a registration resource. */
static void
iser_free_reg_res(struct iser_reg_resources *rsc)
{
        struct ib_mr *fastreg_mr = rsc->mr;

        ib_dereg_mr(fastreg_mr);
}

/*
 * Allocate a zeroed fast-registration descriptor and its MR.
 * Returns the descriptor, or NULL (after freeing it) when the MR could not
 * be allocated.
 */
static struct fast_reg_descriptor *
iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
{
        struct fast_reg_descriptor *new_desc;

        new_desc = malloc(sizeof(*new_desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
        if (iser_alloc_reg_res(ib_device, pd, &new_desc->rsc) == 0)
                return (new_desc);

        ISER_ERR("failed to allocate reg_resources");
        free(new_desc, M_ISER_VERBS);
        return (NULL);
}

/**
 * iser_create_fmr_pool - Creates FMR pool and page_vector
 *
 * returns 0 on success, or errno code on failure
 */
int
iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
{
        struct iser_device *device = ib_conn->device;
        struct fast_reg_descriptor *desc;
        int i;

        INIT_LIST_HEAD(&ib_conn->fastreg.pool);
        ib_conn->fastreg.pool_size = 0;
        for (i = 0; i < cmds_max; i++) {
                desc = iser_create_fastreg_desc(device->ib_device, device->pd);
                if (!desc) {
                        ISER_ERR("Failed to create fastreg descriptor");
                        goto err;
                }

                list_add_tail(&desc->list, &ib_conn->fastreg.pool);
                ib_conn->fastreg.pool_size++;
        }

        return (0);

err:
        iser_free_fastreg_pool(ib_conn);
        return (ENOMEM);
}

/**
 * iser_free_fastreg_pool - releases every descriptor still linked on the
 * connection's fastreg pool; warns when fewer descriptors were found than
 * pool_size says were created.
 */
void
iser_free_fastreg_pool(struct ib_conn *ib_conn)
{
        struct fast_reg_descriptor *cur, *next;
        int released = 0;

        if (list_empty(&ib_conn->fastreg.pool))
                return;

        ISER_DBG("freeing conn %p fr pool", ib_conn);

        list_for_each_entry_safe(cur, next, &ib_conn->fastreg.pool, list) {
                list_del(&cur->list);
                iser_free_reg_res(&cur->rsc);
                free(cur, M_ISER_VERBS);
                released++;
        }

        if (released < ib_conn->fastreg.pool_size)
                ISER_WARN("pool still has %d regions registered",
                          ib_conn->fastreg.pool_size - released);
}

/**
 * iser_create_ib_conn_res - binds the connection to the least used
 * completion context and creates its Queue-Pair (QP)
 *
 * returns 0 on success, or the rdma_create_qp() error on failure (the
 * completion context's usage count is dropped again in that case)
 */
static int
iser_create_ib_conn_res(struct ib_conn *ib_conn)
{
        struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
                                                   ib_conn);
        struct iser_device *device = ib_conn->device;
        struct ib_device_attr *dev_attr = &device->dev_attr;
        struct ib_qp_init_attr init_attr;
        int idx, least_used = 0;
        int ret;

        /* Pick the completion context serving the fewest QPs. */
        mtx_lock(&ig.connlist_mutex);
        for (idx = 1; idx < device->comps_used; idx++) {
                if (device->comps[idx].active_qps <
                    device->comps[least_used].active_qps)
                        least_used = idx;
        }
        ib_conn->comp = &device->comps[least_used];
        ib_conn->comp->active_qps++;
        mtx_unlock(&ig.connlist_mutex);
        ISER_INFO("cq index %d used for ib_conn %p", least_used, ib_conn);

        memset(&init_attr, 0, sizeof init_attr);
        init_attr.qp_type       = IB_QPT_RC;
        init_attr.sq_sig_type   = IB_SIGNAL_REQ_WR;
        init_attr.qp_context    = (void *)ib_conn;
        init_attr.event_handler = iser_qp_event_callback;
        init_attr.send_cq       = ib_conn->comp->cq;
        init_attr.recv_cq       = ib_conn->comp->cq;
        init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
        init_attr.cap.max_recv_sge = 1;
        init_attr.cap.max_send_sge = 2;

        /* The send queue depth is capped by what the device supports. */
        if (dev_attr->max_qp_wr <= ISER_QP_MAX_REQ_DTOS) {
                init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
                iser_conn->max_cmds =
                        ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
        } else {
                init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
                iser_conn->max_cmds =
                        ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
        }
        ISER_DBG("device %s supports max_send_wr %d",
                 device->ib_device->name, dev_attr->max_qp_wr);

        ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
        if (ret) {
                mtx_lock(&ig.connlist_mutex);
                ib_conn->comp->active_qps--;
                mtx_unlock(&ig.connlist_mutex);
                ISER_ERR("unable to alloc mem or create resource, err %d",
                         ret);
                return (ret);
        }

        ib_conn->qp = ib_conn->cma_id->qp;
        ISER_DBG("setting conn %p cma_id %p qp %p",
                 ib_conn, ib_conn->cma_id,
                 ib_conn->cma_id->qp);

        return (ret);
}

/**
 * based on the resolved device node GUID see if there already allocated
 * device for this device. If there's no such, create one.
 */
static struct iser_device *
iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
{
        struct iser_device *device;

        sx_xlock(&ig.device_list_mutex);

        list_for_each_entry(device, &ig.device_list, ig_list)
                /* find if there's a match using the node GUID */
                if (device->ib_device->node_guid == cma_id->device->node_guid)
                        goto inc_refcnt;

        device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
        /* assign this device to the device */
        device->ib_device = cma_id->device;
        /* init the device and link it into ig device list */
        if (iser_create_device_ib_res(device)) {
                free(device, M_ISER_VERBS);
                device = NULL;
                goto out;
        }
        list_add(&device->ig_list, &ig.device_list);

inc_refcnt:
        device->refcount++;
        ISER_INFO("device %p refcount %d", device, device->refcount);
out:
        sx_xunlock(&ig.device_list_mutex);
        return (device);
}

/*
 * Drop one reference on the device; when it was the last one, free the
 * device's IB resources, unlink it from the global list and free it.
 */
static void
iser_device_try_release(struct iser_device *device)
{
        sx_xlock(&ig.device_list_mutex);

        device->refcount--;
        ISER_INFO("device %p refcount %d", device, device->refcount);

        if (device->refcount == 0) {
                iser_free_device_ib_res(device);
                list_del(&device->ig_list);
                free(device, M_ISER_VERBS);
        }

        sx_xunlock(&ig.device_list_mutex);
}

/**
 * Compare-and-exchange on the connection state: when the state equals
 * @comp it is replaced by @exch and 1 is returned, otherwise the state is
 * left alone and 0 is returned.
 *
 * Called with state mutex held
 **/
static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
                                     enum iser_conn_state comp,
                                     enum iser_conn_state exch)
{
        if (iser_conn->state != comp)
                return 0;

        iser_conn->state = exch;
        return 1;
}

/**
 * iser_free_ib_conn_res - release IB related resources
 * @iser_conn: iser connection struct
 * @destroy: when true, also free the login buffer and RX descriptors and
 *     drop the reference on the iser device (only iscsi shutdown and
 *     DEVICE_REMOVAL will use this).
 *
 * This routine is called with the iser state mutex held, so the cm_id
 * removal is done elsewhere. Every step is guarded by a check of the
 * resource it releases, which makes repeated invocation safe.
 */
void
iser_free_ib_conn_res(struct iser_conn *iser_conn,
                                  bool destroy)
{
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;

        ISER_INFO("freeing conn %p cma_id %p qp %p",
                  iser_conn, ib_conn->cma_id, ib_conn->qp);

        if (ib_conn->qp != NULL) {
                /* Give the completion context its usage slot back. */
                mtx_lock(&ig.connlist_mutex);
                ib_conn->comp->active_qps--;
                mtx_unlock(&ig.connlist_mutex);

                rdma_destroy_qp(ib_conn->cma_id);
                ib_conn->qp = NULL;
        }

        if (!destroy)
                return;

        if (iser_conn->login_buf)
                iser_free_login_buf(iser_conn);

        if (iser_conn->rx_descs)
                iser_free_rx_descriptors(iser_conn);

        if (device == NULL)
                return;

        iser_device_try_release(device);
        ib_conn->device = NULL;
}

/**
 * iser_conn_terminate - start the disconnect procedure and wait until the
 * send and receive queues were flushed.
 * @iser_conn: iser connection
 *
 * Moves the connection from ISER_CONN_UP to ISER_CONN_TERMINATING. When a
 * cma_id is still present it disconnects it, then posts a "beacon" work
 * request (wr_id ISER_BEACON_WRID) first on the send queue and then on the
 * receive queue, sleeping on beacon.flush_cv after each post until the
 * completion handler signals that the beacon came back.
 *
 * Called with state mutex held.
 *
 * Returns 0 when the connection was not in state ISER_CONN_UP (nothing is
 * done); returns 1 in every other case, including a missing QP or a failed
 * beacon post.
 */
int
iser_conn_terminate(struct iser_conn *iser_conn)
{
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        const struct ib_send_wr *bad_send_wr;
        const struct ib_recv_wr *bad_recv_wr;
        int err = 0;

        /* terminate the iser conn only if the conn state is UP */
        if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
                                           ISER_CONN_TERMINATING))
                return (0);

        ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);

        if (ib_conn->qp == NULL) {
                /* HOW can this be??? */
                ISER_WARN("qp wasn't created");
                return (1);
        }

        /*
         * Todo: This is a temporary workaround.
         * We serialize the connection closure using global lock in order to
         * receive all posted beacons completions.
         * Without Serialization, in case we open many connections (QPs) on
         * the same CQ, we might miss beacons because of missing interrupts.
         */
        sx_xlock(&ig.close_conns_mutex);

        /*
         * In case we didn't already clean up the cma_id (peer initiated
         * a disconnection), we need to Cause the CMA to change the QP
         * state to ERROR.
         */
        if (ib_conn->cma_id) {
                /* A disconnect failure is only logged; the flush still runs. */
                err = rdma_disconnect(ib_conn->cma_id);
                if (err)
                        ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
                                iser_conn, err);

                /*
                 * flush_lock is taken before posting so that the completion
                 * handler, which signals flush_cv under the same lock,
                 * cannot signal before this thread is waiting.
                 */
                mtx_lock(&ib_conn->beacon.flush_lock);
                memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
                ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
                ib_conn->beacon.send.opcode = IB_WR_SEND;
                /* post an indication that all send flush errors were consumed */
                err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
                if (err) {
                        ISER_ERR("conn %p failed to post send_beacon", ib_conn);
                        mtx_unlock(&ib_conn->beacon.flush_lock);
                        goto out;
                }

                ISER_DBG("before send cv_wait: %p", iser_conn);
                cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
                ISER_DBG("after send cv_wait: %p", iser_conn);

                /* Same handshake again, this time for the receive queue. */
                memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
                ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
                /* post an indication that all recv flush errors were consumed */
                err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
                if (err) {
                        ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
                        mtx_unlock(&ib_conn->beacon.flush_lock);
                        goto out;
                }

                ISER_DBG("before recv cv_wait: %p", iser_conn);
                cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
                mtx_unlock(&ib_conn->beacon.flush_lock);
                ISER_DBG("after recv cv_wait: %p", iser_conn);
        }
out:
        sx_xunlock(&ig.close_conns_mutex);
        return (1);
}

/**
 * Marks the connection behind cma_id as ISER_CONN_TERMINATING and wakes
 * whoever sleeps on the connection's up_cv.
 *
 * Called with state mutex held
 **/
static void
iser_connect_error(struct rdma_cm_id *cma_id)
{
        struct iser_conn *failed_conn = cma_id->context;

        ISER_ERR("conn %p", failed_conn);

        failed_conn->state = ISER_CONN_TERMINATING;
        cv_signal(&failed_conn->up_cv);
}

/**
 * Called with state mutex held
 **/
static void
iser_addr_handler(struct rdma_cm_id *cma_id)
{
        struct iser_device *device;
        struct iser_conn   *iser_conn;
        struct ib_conn   *ib_conn;
        int    ret;

        iser_conn = cma_id->context;

        ib_conn = &iser_conn->ib_conn;
        device = iser_device_find_by_ib_device(cma_id);
        if (!device) {
                ISER_ERR("conn %p device lookup/creation failed",
                         iser_conn);
                iser_connect_error(cma_id);
                return;
        }

        ib_conn->device = device;

        ret = rdma_resolve_route(cma_id, 1000);
        if (ret) {
                ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
                iser_connect_error(cma_id);
                return;
        }
}

/**
 * Route-resolved step: creates the connection's QP and issues the RDMA CM
 * connect request carrying the iSER CM header as private data. Any failure
 * is turned into a connect error.
 *
 * Called with state mutex held
 **/
static void
iser_route_handler(struct rdma_cm_id *cma_id)
{
        struct iser_conn *iser_conn = cma_id->context;
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;
        struct rdma_conn_param conn_param;
        struct iser_cm_hdr req_hdr;
        int err;

        err = iser_create_ib_conn_res(ib_conn);
        if (err != 0) {
                iser_connect_error(cma_id);
                return;
        }

        memset(&req_hdr, 0, sizeof(req_hdr));
        req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
                        ISER_SEND_W_INV_NOT_SUPPORTED);

        memset(&conn_param, 0, sizeof conn_param);
        conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
        /*
         * Initiator depth should not be set, but it is kept at 1 for
         * compatibility with old targets.
         */
        conn_param.initiator_depth     = 1;
        conn_param.retry_count         = 7;
        conn_param.rnr_retry_count     = 6;
        conn_param.private_data        = (void *)&req_hdr;
        conn_param.private_data_len    = sizeof(struct iser_cm_hdr);

        err = rdma_connect(cma_id, &conn_param);
        if (err == 0)
                return;

        ISER_ERR("conn %p failure connecting: %d", iser_conn, err);
        iser_connect_error(cma_id);
}

/**
 * Connection-established step: logs the local and remote QP numbers, marks
 * the connection ISER_CONN_UP and wakes whoever sleeps on up_cv.
 *
 * The QP query is informational only. Its result is checked so that the
 * attribute structure is never read when the query failed and left it
 * unfilled; a failed query does not prevent the connection from coming up.
 *
 * Called with state mutex held
 **/
static void
iser_connected_handler(struct rdma_cm_id *cma_id)
{
        struct iser_conn *iser_conn = cma_id->context;
        struct ib_qp_attr attr;
        struct ib_qp_init_attr init_attr;
        int err;

        err = ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
        if (err == 0) {
                ISER_INFO("remote qpn:%x my qpn:%x",
                          attr.dest_qp_num, cma_id->qp->qp_num);
        } else {
                ISER_WARN("conn %p failed to query qp, err %d",
                          iser_conn, err);
        }

        iser_conn->state = ISER_CONN_UP;

        cv_signal(&iser_conn->up_cv);
}

/**
 * Terminates the connection and, when iser_conn_terminate() reports that a
 * termination was actually started, notifies the iscsi layer through the
 * ic_error callback. The @destroy argument is currently not used.
 *
 * Called with state mutex held
 **/
static void
iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
{
        struct iser_conn *iser_conn = cma_id->context;
        int terminated = iser_conn_terminate(iser_conn);

        if (terminated == 0)
                return;

        iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
}

/*
 * RDMA CM event handler for iser connections. Each event is dispatched,
 * with the connection's state mutex held, to the matching step handler;
 * unknown events are only logged. Always returns 0.
 */
int
iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
        struct iser_conn *iser_conn = cma_id->context;

        ISER_INFO("event %d status %d conn %p id %p",
                  event->event, event->status, cma_id->context, cma_id);

        sx_xlock(&iser_conn->state_mutex);
        switch (event->event) {
        /* Connection establishment, one step per event. */
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                iser_addr_handler(cma_id);
                break;
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                iser_route_handler(cma_id);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                iser_connected_handler(cma_id);
                break;

        /* Teardown of a connection. */
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_ADDR_CHANGE:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                iser_cleanup_handler(cma_id, false);
                break;

        /* Failures while connecting. */
        case RDMA_CM_EVENT_ADDR_ERROR:
        case RDMA_CM_EVENT_ROUTE_ERROR:
        case RDMA_CM_EVENT_CONNECT_ERROR:
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_REJECTED:
                iser_connect_error(cma_id);
                break;

        default:
                ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
                break;
        }
        sx_xunlock(&iser_conn->state_mutex);

        return (0);
}

int
iser_post_recvl(struct iser_conn *iser_conn)
{
        const struct ib_recv_wr *rx_wr_failed;
        struct ib_recv_wr rx_wr;
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        struct ib_sge     sge;
        int ib_ret;

        sge.addr   = iser_conn->login_resp_dma;
        sge.length = ISER_RX_LOGIN_SIZE;
        sge.lkey   = ib_conn->device->mr->lkey;

        rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
        rx_wr.sg_list = &sge;
        rx_wr.num_sge = 1;
        rx_wr.next    = NULL;

        ib_conn->post_recv_buf_count++;
        ib_ret  = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
        if (ib_ret) {
                ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
                ib_conn->post_recv_buf_count--;
        }

        return (ib_ret);
}

/*
 * Post @count receive work requests, taking RX descriptors from the
 * connection's descriptor ring starting at rx_desc_head.
 *
 * The work requests are built in ib_conn->rx_wr and chained through their
 * next pointers; the last one terminates the list. post_recv_buf_count is
 * raised by @count before posting and rolled back when the post fails; the
 * ring head only advances on success.
 *
 * A non-positive @count posts nothing and returns 0. Without this guard the
 * list terminator would be written to the element before ib_conn->rx_wr.
 *
 * Returns the ib_post_recv() result.
 */
int
iser_post_recvm(struct iser_conn *iser_conn, int count)
{
        const struct ib_recv_wr *rx_wr_failed;
        struct ib_recv_wr *rx_wr;
        int i, ib_ret;
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        unsigned int my_rx_head = iser_conn->rx_desc_head;
        struct iser_rx_desc *rx_desc;

        if (count <= 0)
                return (0);

        for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
                rx_desc         = &iser_conn->rx_descs[my_rx_head];
                rx_wr->wr_id    = (uintptr_t)rx_desc;
                rx_wr->sg_list  = &rx_desc->rx_sg;
                rx_wr->num_sge  = 1;
                rx_wr->next     = rx_wr + 1;
                /* The descriptor ring wraps at qp_max_recv_dtos. */
                my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
        }

        rx_wr--;
        rx_wr->next = NULL; /* mark end of work requests list */

        ib_conn->post_recv_buf_count += count;
        ib_ret  = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
        if (ib_ret) {
                ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
                ib_conn->post_recv_buf_count -= count;
        } else
                iser_conn->rx_desc_head = my_rx_head;

        return (ib_ret);
}

/**
 * iser_start_send - Initiate a Send DTO operation
 *
 * returns 0 on success, -1 on failure
 */
int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
                   bool signal)
{
        int               ib_ret;
        const struct ib_send_wr *send_wr_failed;
        struct ib_send_wr send_wr;

        ib_dma_sync_single_for_device(ib_conn->device->ib_device,
                                      tx_desc->dma_addr, ISER_HEADERS_LEN,
                                      DMA_TO_DEVICE);

        send_wr.next       = NULL;
        send_wr.wr_id      = (uintptr_t)tx_desc;
        send_wr.sg_list    = tx_desc->tx_sg;
        send_wr.num_sge    = tx_desc->num_sge;
        send_wr.opcode     = IB_WR_SEND;
        send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;

        ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
        if (ib_ret)
                ISER_ERR("ib_post_send failed, ret:%d", ib_ret);

        return (ib_ret);
}