root/usr/src/lib/udapl/udapl_tavor/tavor/dapl_tavor_wr.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "dapl.h"
#include "dapl_tavor_wr.h"
#include "dapl_hash.h"
#include "dapl_tavor_ibtf_impl.h"

static dapls_tavor_wrid_entry_t *dapli_tavor_wrid_find_match(
        dapls_tavor_workq_hdr_t *, tavor_hw_cqe_t *);
static dapls_tavor_wrid_list_hdr_t *dapli_tavor_wrid_get_list(uint32_t, int);
static void dapli_tavor_wrid_reaplist_add(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t,
    uint_t, uint_t);
static uint32_t dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_list_reap(
        dapls_tavor_wrid_list_hdr_t *);
static dapls_tavor_workq_hdr_t *dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t,
    uint_t, uint_t, uint_t);
static void dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *,
    dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *,
    dapls_tavor_wrid_list_hdr_t *);
static void dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t);
static void dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t);
static DAT_RETURN dapli_tavor_cq_wqhdr_add(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);
static void dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t,
    dapls_tavor_workq_hdr_t *);

/*
 * dapls_tavor_wrid_get_entry()
 *    Find the work request entry that corresponds to a CQE and return its
 *    work request ID.
 *
 *    cq            - completion queue the CQE came from
 *    cqe           - hardware completion queue entry being processed
 *    send_or_recv  - selects the send or receive work queue of the QP
 *    error         - non-zero when the CQE is a "failed" completion
 *    wre           - may be NULL; for failed completions it receives a copy
 *                    of the matched entry, with "wr_wqeaddrsz" replaced by
 *                    the value of the next entry in the container
 *
 *    Both the CQ's work queue header lock and the work queue's WRID lock
 *    are held for the duration of the lookup and dropped before returning.
 */
uint64_t
dapls_tavor_wrid_get_entry(ib_cq_handle_t cq, tavor_hw_cqe_t *cqe,
    uint_t send_or_recv, uint_t error, dapls_tavor_wrid_entry_t *wre)
{
        dapls_tavor_workq_hdr_t         *wqhdr;
        dapls_tavor_wrid_entry_t        *match;
        uint64_t                        result;

        /* The list of work queues hanging off this CQ is lock protected */
        dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

        /* Look up the work queue by QP number and direction */
        wqhdr = dapli_tavor_wrid_wqhdr_find(cq, TAVOR_CQE_QPNUM_GET(cqe),
            send_or_recv);
        dapl_os_assert(wqhdr != NULL);

        /*
         * Whether the completion succeeded or failed, the containers are
         * locked and searched for the first WR whose WQE addr/size matches
         * the CQE.  A possible future enhancement would be to skip over
         * "unsignaled" entries and go straight to the next "signaled" one
         * on success. XXX
         */
        dapl_os_lock(&wqhdr->wq_wrid_lock->wrl_lock);
        match = dapli_tavor_wrid_find_match(wqhdr, cqe);

        /* A successful completion must belong to a "signaled" request */
        dapl_os_assert(error || (match->wr_signaled_dbd &
            TAVOR_WRID_ENTRY_SIGNALED));

        /*
         * For a failed completion, hand the caller a copy of the entry for
         * later CQE processing.  The "wqeaddrsz" in that copy is taken from
         * the next entry in the container, which error handling needs in
         * order to process the "error" CQE correctly.
         */
        if ((wre != NULL) && error) {
                *wre = *match;
                wre->wr_wqeaddrsz = dapli_tavor_wrid_get_wqeaddrsz(wqhdr);
        }

        /* Capture the WRID before the locks are released */
        result = match->wr_wrid;

        dapl_os_unlock(&wqhdr->wq_wrid_lock->wrl_lock);
        dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);

        return (result);
}


/*
 * dapli_tavor_wrid_find_match()
 *    Walk the work queue's chain of WRID "containers", starting at
 *    "wq_wrid_poll", and return the first entry whose "wr_wqeaddrsz" equals
 *    the WQE addr/size field of the CQE.  Entries passed over on the way
 *    are consumed: each container's "wl_head" is advanced beyond them.
 *
 *    For a container with "wl_srq_en" set the lookup is delegated to
 *    dapli_tavor_wrid_find_match_srq().
 *
 *    The caller is expected to hold the work queue's WRID lock (as
 *    dapls_tavor_wrid_get_entry() does).
 *
 *    Returns the matching entry; asserts if no entry matched.
 */
static dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match(dapls_tavor_workq_hdr_t *wq, tavor_hw_cqe_t *cqe)
{
        dapls_tavor_wrid_entry_t        *curr = NULL;
        dapls_tavor_wrid_list_hdr_t     *container;
        uint32_t                wqeaddr_size;
        uint32_t                head, tail, size;
        /*
         * "last_container" is read after the loop, so it must have a defined
         * value even when the container chain is empty and the loop body
         * never runs.
         */
        int                     found = 0, last_container = 0;

        /* dapl_os_assert(MUTEX_HELD(&wq->wq_wrid_lock)); */

        /* Pull the "wqeaddrsz" information from the CQE */
        wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cqe);

        /*
         * Walk the "containers" list(s), find first WR with a matching WQE
         * addr.  If the current "container" is not the last one on the list,
         * i.e. not the current one to which we are posting new WRID entries,
         * then we do not attempt to update the "wq_head" and "wq_full"
         * indicators on the main work queue header.  We do, however,
         * update the "head" and "full" indicators on the individual containers
         * as we go.  This is imperative because we need to be able to
         * determine when the current container has been emptied (so that we
         * can move on to the next container).
         */
        container = wq->wq_wrid_poll;
        while (container != NULL) {

                /* Is this the last/only "container" on the list */
                last_container = (container != wq->wq_wrid_post) ? 0 : 1;

                /*
                 * First check if we are on an SRQ.  If so, we grab the entry
                 * and break out.  Since SRQ wridlist's are never added to
                 * reaplist, they can only be the last container.
                 */
                if (container->wl_srq_en) {
                        dapl_os_assert(last_container == 1);
                        curr = dapli_tavor_wrid_find_match_srq(container, cqe);
                        break;
                }

                /*
                 * Grab the current "head", "tail" and "size" fields before
                 * walking the list in the current container. Note: the "size"
                 * field here must always be a power-of-2 (it is used as a
                 * wrap-around mask below).  The "full" parameter is checked
                 * (and updated) here to distinguish the "queue full"
                 * condition from "queue empty".
                 */
                head = container->wl_head;
                tail = container->wl_tail;
                size = container->wl_size;
                while ((head != tail) || (container->wl_full)) {
                        container->wl_full = 0;
                        curr = &container->wl_wre[head];
                        head = ((head + 1) & (size - 1));
                        /*
                         * If the current entry's "wqeaddrsz" matches the one
                         * we're searching for, then this must correspond to
                         * the work request that caused the completion.  Set
                         * the "found" flag and bail out.
                         */
                        if (curr->wr_wqeaddrsz == wqeaddr_size) {
                                found = 1;
                                break;
                        }
                }

                /*
                 * If the current container is empty (having reached here the
                 * "head == tail" condition can only mean that the container
                 * is empty), then NULL out the "wl_wre_old_tail" field and
                 * (potentially) remove the current container from future
                 * searches.
                 */
                if (head == tail) {
                        container->wl_wre_old_tail = NULL;
                        /*
                         * If this wasn't the last "container" on the chain,
                         * i.e. the one to which new WRID entries will be
                         * added, then remove it from the list.
                         * Note: we don't "lose" the memory pointed to by this
                         * because we should have already put this container
                         * on the "reapable" list (from where it will later be
                         * pulled).
                         */
                        if (!last_container) {
                                wq->wq_wrid_poll = container->wl_next;
                        }
                }

                /* Update the head index for the container */
                container->wl_head = head;

                /*
                 * If the entry was found in this container, then continue to
                 * bail out.  Else reset the "curr" pointer and move on to the
                 * next container (if there is one).  Note: the only real
                 * reason for setting "curr = NULL" here is so that the ASSERT
                 * below can catch the case where no matching entry was found
                 * on any of the lists.
                 */
                if (found) {
                        break;
                } else {
                        curr = NULL;
                        container = container->wl_next;
                }
        }

        /*
         * Update work queue header's "head" and "full" conditions to match
         * the last entry on the container list.  (Note: Only if we're pulling
         * entries from the last work queue portion of the list, i.e. not from
         * the previous portions that may be the "reapable" list.)
         */
        if (last_container) {
                wq->wq_head = wq->wq_wrid_post->wl_head;
                wq->wq_full = wq->wq_wrid_post->wl_full;
        }

        /* Ensure that we've actually found what we were searching for */
        dapl_os_assert(curr != NULL);

        return (curr);
}

/*
 * dapli_tavor_wrid_find_match_srq()
 *    SRQ flavour of the WRID lookup.  The WQE address carried by the CQE is
 *    turned into an index into the container's "wl_wre" array, and that
 *    same address is appended to the tail of the container's free list.
 *
 *    Returns a pointer to the work request ID entry at the computed index.
 */
dapls_tavor_wrid_entry_t *
dapli_tavor_wrid_find_match_srq(dapls_tavor_wrid_list_hdr_t *wl,
    tavor_hw_cqe_t *cqe)
{
        uint32_t        desc_addr;
        uint32_t        entry_idx;
        uint32_t        wrap_mask;
        uint32_t        slot;

        /* Low six bits of the CQE's addr/size field are masked off */
        desc_addr = TAVOR_CQE_WQEADDRSZ_GET(cqe) & 0xFFFFFFC0;

        /*
         * Translate the WQE address into its position within the SRQ work
         * queue; the WRID entry for this WQE lives at the same position in
         * "wl_wre".
         */
        entry_idx = TAVOR_SRQ_WQ_INDEX(wl->wl_srq_desc_addr, desc_addr,
            wl->wl_srq_wqesz);

        /* An index outside the queue should be impossible */
        dapl_os_assert(entry_idx < wl->wl_size);

        /* Return the completed WQE to the tail of the free list */
        wrap_mask = wl->wl_size - 1;
        slot = wl->wl_freel_tail;

        wl->wl_freel_entries++;
        dapl_os_assert(wl->wl_freel_entries <= wl->wl_size);

        wl->wl_free_list[slot] = desc_addr;
        wl->wl_freel_tail = (slot + 1) & wrap_mask;

        return (&wl->wl_wre[entry_idx]);
}

/*
 * dapls_tavor_wrid_cq_reap()
 *    Release every WRID container currently queued on the CQ's "reapable"
 *    list and leave that list empty.  When reaping a container shows that
 *    its work queue header has no containers left, the header is removed
 *    from the CQ as well.
 *
 *    Acquires and releases "cq_wrid_wqhdr_lock" internally.
 */
void
dapls_tavor_wrid_cq_reap(ib_cq_handle_t cq)
{
        dapls_tavor_wrid_list_hdr_t     *cur, *following;
        dapls_tavor_workq_hdr_t         *empty_wqhdr;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */

        /* The CQ's work queue list must not change underneath us */
        dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

        for (cur = cq->cq_wrid_reap_head; cur != NULL; cur = following) {
                /* Fetch the link first; reaping may free "cur" */
                following = cur->wl_reap_next;

                /*
                 * A non-NULL result means the container just reaped was the
                 * only one left on its work queue header, so the header goes
                 * too.
                 */
                empty_wqhdr = dapli_tavor_wrid_list_reap(cur);
                if (empty_wqhdr != NULL) {
                        dapli_tavor_cq_wqhdr_remove(cq, empty_wqhdr);
                }
        }

        /* Everything has been reaped; reset the CQ's reap list */
        cq->cq_wrid_reap_tail = NULL;
        cq->cq_wrid_reap_head = NULL;

        dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}


/*
 * dapls_tavor_wrid_cq_force_reap()
 *    Free every WRID container still associated with the CQ: first the
 *    ones on the "reapable" list (via dapls_tavor_wrid_cq_reap()), then
 *    the ones still chained to each work queue header in the CQ's hash
 *    table.  Each work queue header is removed from the CQ once its last
 *    container has been reaped.
 *
 *    "cq_wrid_wqhdr_lock" is taken for the walk of the hash table and is
 *    released again before returning.
 */
void
dapls_tavor_wrid_cq_force_reap(ib_cq_handle_t cq)
{
        DAPL_HASH_DATA          curr;
        DAT_RETURN              retval;
        dapls_tavor_workq_hdr_t         *to_free_wqhdr;
        dapls_tavor_wrid_list_hdr_t     *container, *to_free;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_lock)); */

        /*
         * The first step is to walk the "reapable" list and free up those
         * containers.  This is necessary because the containers on the
         * reapable list are not otherwise connected to the work queue headers
         * anymore.  Note: dapls_tavor_wrid_cq_reap() takes and drops
         * "cq_wrid_wqhdr_lock" itself, so it must be called before we
         * acquire that lock below.
         */
        dapls_tavor_wrid_cq_reap(cq);

        /* Now lock the list of work queues associated with this CQ */
        dapl_os_lock(&cq->cq_wrid_wqhdr_lock);

        /*
         * Walk the list of work queue headers and free up all the WRID list
         * containers chained to it.  Note: We don't need to grab the locks
         * for each of the individual WRID lists here because the only way
         * things can be added or removed from the list at this point would be
         * through post a work request to a QP.  But if we've come this far,
         * then we can be assured that there are no longer any QP associated
         * with the CQ that we are trying to free.
         */
        retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
            DAPL_HASH_ITERATE_INIT, &curr);
        dapl_os_assert(retval == DAT_SUCCESS);

        while (curr != NULL) {
                to_free_wqhdr = (dapls_tavor_workq_hdr_t *)curr;
                container = ((dapls_tavor_workq_hdr_t *)curr)->wq_wrid_poll;
                /*
                 * Advance the iterator before the current header can be
                 * removed from the hash table below.
                 */
                retval = dapls_hash_iterate(cq->cq_wrid_wqhdr_list,
                    DAPL_HASH_ITERATE_NEXT, &curr);
                dapl_os_assert(retval == DAT_SUCCESS);
                while (container != NULL) {
                        to_free   = container;
                        container = container->wl_next;
                        /*
                         * If reaping the WRID list containers pulls the last
                         * container from the given work queue header, then
                         * we free the work queue header as well.  Note: we
                         * ignore the return value because we know that the
                         * work queue header should always be freed once the
                         * list of containers has come to an end.
                         */
                        (void) dapli_tavor_wrid_list_reap(to_free);
                        if (container == NULL) {
                                dapli_tavor_cq_wqhdr_remove(cq, to_free_wqhdr);
                        }
                }
        }

        /* Release the work queue list lock acquired above */
        dapl_os_unlock(&cq->cq_wrid_wqhdr_lock);
}


/*
 * dapli_tavor_wrid_get_list()
 *    Allocate and initialize an empty WRID list "container" with room for
 *    "qsize" work request entries.
 *
 *    The container header carries the links used for the "reapable" list
 *    and for chaining old and new containers, plus the head/tail/size
 *    bookkeeping.  The entries themselves live in a separate allocation.
 *
 *    When "wrid_for_srq" is non-zero the container additionally gets:
 *      - its own WRID lock, carved out of the same allocation directly
 *        after the header and initialized here;
 *      - a free list of "qsize" 32-bit slots, with "wl_freel_entries"
 *        preset to "qsize".  The slots are left uninitialized, and
 *        "wl_srq_wqesz" and "wl_srq_desc_addr" are not set here either.
 *
 *    Returns the new container, or NULL if any allocation failed (in which
 *    case everything allocated so far has been released).
 */
static dapls_tavor_wrid_list_hdr_t *
dapli_tavor_wrid_get_list(uint32_t qsize, int wrid_for_srq)
{
        dapls_tavor_wrid_list_hdr_t     *list;
        dapls_tavor_wrid_entry_t        *entries;
        uint32_t                        *freel = NULL;
        uint32_t                        list_bytes;
        uint32_t                        entries_bytes;
        uint32_t                        freel_bytes = 0;

        /* Header, optionally followed by the SRQ's WRID lock */
        list_bytes = sizeof (dapls_tavor_wrid_list_hdr_t);
        if (wrid_for_srq) {
                list_bytes += sizeof (dapls_tavor_wrid_lock_t);
        }
        list = dapl_os_alloc(list_bytes);
        if (list == NULL) {
                return (NULL);
        }

        if (wrid_for_srq) {
                /* The lock sits immediately behind the header */
                list->wl_lock = (dapls_tavor_wrid_lock_t *)(
                    (uintptr_t)list + sizeof (dapls_tavor_wrid_list_hdr_t));
                dapl_os_lock_init(&list->wl_lock->wrl_lock);
                list->wl_lock->wrl_on_srq = wrid_for_srq;
        } else {
                list->wl_lock = NULL;
        }

        /* One WRID entry per work queue slot */
        entries_bytes = qsize * sizeof (dapls_tavor_wrid_entry_t);
        entries = dapl_os_alloc(entries_bytes);
        if (entries == NULL) {
                goto fail;
        }

        /* SRQ containers also need memory for the free list */
        if (wrid_for_srq) {
                freel_bytes = qsize * sizeof (uint32_t);
                freel = dapl_os_alloc(freel_bytes);
                if (freel == NULL) {
                        goto fail;
                }
        }

        /* Links: the new container is not chained to anything yet */
        list->wl_next = NULL;
        list->wl_prev = NULL;
        list->wl_reap_next = NULL;

        /* Ring bookkeeping: empty queue of "qsize" entries */
        list->wl_wre = entries;
        list->wl_wre_old_tail = NULL;
        list->wl_size = qsize;
        list->wl_head = 0;
        list->wl_tail = 0;
        list->wl_full = 0;

        /* Free list state ("freel" is NULL unless this is for an SRQ) */
        list->wl_free_list = freel;
        list->wl_freel_head = 0;
        list->wl_freel_tail = 0;
        if (wrid_for_srq) {
                list->wl_srq_en = 1;
                list->wl_freel_entries = qsize;
        } else {
                list->wl_srq_en = 0;
                list->wl_freel_entries = 0;
                list->wl_srq_wqesz = 0;
                list->wl_srq_desc_addr = 0;
        }
        return (list);

fail:
        /*
         * Only the header (with its lock, if any) and possibly the entry
         * array exist at this point; a free list allocation is always the
         * last step and so never needs undoing.
         */
        if (wrid_for_srq) {
                dapl_os_lock_destroy(&list->wl_lock->wrl_lock);
        }
        dapl_os_free(list, list_bytes);
        if (entries != NULL) {
                dapl_os_free(entries, entries_bytes);
        }
        return (NULL);
}


/*
 * dapli_tavor_wrid_reaplist_add()
 *    Append the work queue's "post" container (the last one on its chain)
 *    to the tail of the CQ's "reapable" list.  The work queue's WRID lock
 *    is held while the list is updated.
 */
static void
dapli_tavor_wrid_reaplist_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wq)
{
        dapls_tavor_wrid_list_hdr_t     *post;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

        dapl_os_lock(&wq->wq_wrid_lock->wrl_lock);

        post = wq->wq_wrid_post;

        if ((cq->cq_wrid_reap_head != NULL) ||
            (cq->cq_wrid_reap_tail != NULL)) {
                /* Non-empty list: link behind the current tail */
                cq->cq_wrid_reap_tail->wl_reap_next = post;
        } else {
                /* Empty list: the container becomes the head as well */
                cq->cq_wrid_reap_head = post;
        }
        cq->cq_wrid_reap_tail = post;

        dapl_os_unlock(&wq->wq_wrid_lock->wrl_lock);
}


/*
 * dapli_tavor_wrid_wqhdr_find()
 *    Look up a work queue header in the CQ's hash table.  The hash key is
 *    built from "send_or_recv" in the upper 32 bits and the QP number in
 *    the lower 32 bits.
 *
 *    Returns the work queue header, or NULL if the search did not succeed.
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_find(ib_cq_handle_t cq, uint_t qpn, uint_t send_or_recv)
{
        DAPL_HASH_DATA          hit;
        uint64_t                packed;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

        packed = ((uint64_t)send_or_recv << 32) | (uint32_t)qpn;

        if (dapls_hash_search(cq->cq_wrid_wqhdr_list, (DAPL_HASH_KEY)packed,
            &hit) != DAT_SUCCESS) {
                return (NULL);
        }

        return ((dapls_tavor_workq_hdr_t *)hit);
}




/*
 * dapli_tavor_wrid_get_wqeaddrsz()
 *    Return the "wr_wqeaddrsz" of the entry currently at the head of the
 *    work queue's "poll" container, or zero when that container's head
 *    and tail indices are equal.
 *
 *    This is meant to be called right after dapli_tavor_wrid_find_match()
 *    has pulled an entry and advanced the head index, so "head" already
 *    designates the next entry and "head == tail" can only mean the
 *    container is empty.
 */
static uint32_t
dapli_tavor_wrid_get_wqeaddrsz(dapls_tavor_workq_hdr_t *wq)
{
        dapls_tavor_wrid_list_hdr_t     *poll = wq->wq_wrid_poll;

        /* Nothing left in the container: there is no next entry */
        if (poll->wl_head == poll->wl_tail) {
                return (0);
        }

        /* "wl_head" already points at the next entry; no increment needed */
        return (poll->wl_wre[poll->wl_head].wr_wqeaddrsz);
}



/*
 * dapli_tavor_wrid_list_reap()
 *    Unlink a WRID list container from its work queue header's chain and,
 *    unless it belongs to an SRQ, free it together with its entry array.
 *
 *    Returns the work queue header if the container was both its "poll"
 *    and its "post" container (i.e. the header is now reapable as well),
 *    otherwise NULL.
 *
 *    Note: The "wqhdr_list_lock" must be held.  The work queue's WRID lock
 *    is taken and released here.
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_list_reap(dapls_tavor_wrid_list_hdr_t *wridlist)
{
        dapls_tavor_workq_hdr_t         *owner;
        dapls_tavor_workq_hdr_t         *reapable = NULL;
        dapls_tavor_wrid_list_hdr_t     *before, *after;
        int                             was_poll, was_post;

        /* The container keeps a back pointer to its work queue header */
        owner = wridlist->wl_wqhdr;
        dapl_os_lock(&owner->wq_wrid_lock->wrl_lock);

        /* Splice the container out of the doubly linked chain */
        before = wridlist->wl_prev;
        after = wridlist->wl_next;
        if (before != NULL) {
                before->wl_next = after;
        }
        if (after != NULL) {
                after->wl_prev = before;
        }

        was_poll = (owner->wq_wrid_poll == wridlist);
        was_post = (owner->wq_wrid_post == wridlist);

        /*
         * If the header referenced this container as both "poll" and
         * "post", nothing else is left on it and the header itself can be
         * reaped by the caller.
         */
        if (was_poll && was_post) {
                reapable = owner;
        }

        /* Keep the header's "poll" and "post" pointers valid */
        if (was_poll) {
                owner->wq_wrid_poll = after;
        }
        if (was_post) {
                owner->wq_wrid_post = NULL;
        }

        /*
         * Free the entry array and the container.  SRQ containers are left
         * alone here; they are freed when the SRQ itself is freed.
         */
        if (!wridlist->wl_srq_en) {
                if (wridlist->wl_wre) {
                        dapl_os_free(wridlist->wl_wre, wridlist->wl_size *
                            sizeof (dapls_tavor_wrid_entry_t));
                }
                dapl_os_assert(wridlist->wl_free_list == NULL);
                dapl_os_free(wridlist, sizeof (dapls_tavor_wrid_list_hdr_t));
        }

        dapl_os_unlock(&owner->wq_wrid_lock->wrl_lock);

        return (reapable);
}

/*
 * dapls_tavor_srq_wrid_init()
 *    Create the WRID list container for an SRQ and attach it to the SRQ.
 *    The container's free list is filled with consecutive descriptor
 *    addresses: it starts at "srq_wq_desc_addr" and each following slot is
 *    "srq_wq_wqesz" higher than the one before.
 *
 *    Returns DAT_SUCCESS, or DAT_INSUFFICIENT_RESOURCES |
 *    DAT_RESOURCE_MEMORY (with "srq_wridlist" set to NULL) if the
 *    container could not be allocated.
 */
DAT_RETURN
dapls_tavor_srq_wrid_init(ib_srq_handle_t srq)
{
        dapls_tavor_wrid_list_hdr_t     *list;
        uint32_t                        desc;
        int                             slot;

        list = dapli_tavor_wrid_get_list(srq->srq_wq_numwqe, 1);
        if (list == NULL) {
                srq->srq_wridlist = NULL;
                return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
        }

        /* Seed the free list with the descriptor address of every WQE */
        desc = srq->srq_wq_desc_addr;
        list->wl_free_list[0] = desc;
        for (slot = 1; slot < srq->srq_wq_numwqe; slot++) {
                desc = desc + srq->srq_wq_wqesz;
                list->wl_free_list[slot] = desc;
        }

        /* Remember the geometry needed to map an address back to an index */
        list->wl_srq_desc_addr = srq->srq_wq_desc_addr;
        list->wl_srq_wqesz = srq->srq_wq_wqesz;

        srq->srq_wridlist = list;
        return (DAT_SUCCESS);
}

/*
 * dapls_tavor_srq_wrid_free()
 *    Release the SRQ's WRID list container, if it has one: the entry
 *    array, the free list, the embedded WRID lock and finally the container
 *    itself.  "srq_wridlist" is reset to NULL afterwards.
 */
void
dapls_tavor_srq_wrid_free(ib_srq_handle_t srq)
{
        dapls_tavor_wrid_list_hdr_t     *list = srq->srq_wridlist;
        size_t                          lock_bytes = 0;

        /* Nothing attached, nothing to do */
        if (!list) {
                return;
        }

        dapl_os_assert(list->wl_srq_en == 1);

        if (list->wl_wre) {
                dapl_os_free(list->wl_wre, list->wl_size *
                    sizeof (dapls_tavor_wrid_entry_t));
        }
        if (list->wl_free_list) {
                dapl_os_free(list->wl_free_list, list->wl_size *
                    sizeof (uint32_t));
        }

        /*
         * The lock was allocated in one piece with the container header, so
         * it only needs destroying; its size is added to the amount freed
         * below.
         */
        if (list->wl_lock) {
                dapl_os_assert(list->wl_lock->wrl_on_srq == 1);
                dapl_os_lock_destroy(&list->wl_lock->wrl_lock);
                lock_bytes = sizeof (dapls_tavor_wrid_lock_t);
        }
        lock_bytes = lock_bytes; /* pacify lint */
        dapl_os_free(list, lock_bytes +
            sizeof (dapls_tavor_wrid_list_hdr_t));

        srq->srq_wridlist = NULL;
}


/*
 * dapls_tavor_wrid_init()
 *    Set up WRID tracking for a QP: make sure a work queue header exists
 *    on the send CQ and on the receive CQ for this QP number, reset the
 *    headers' size/head/tail/full state, and chain a WRID list container
 *    to each of them.
 *
 *    The send side always gets a freshly allocated container.  The receive
 *    side gets a fresh container too, unless the QP is attached to an SRQ,
 *    in which case the SRQ's existing container (and its lock) is used.
 *
 *    Returns:
 *      DAT_SUCCESS
 *      DAT_INSUFFICIENT_RESOURCES
 *          - the send work queue header could not be created
 *      DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY
 *          - any later allocation failed
 *
 *    NOTE(review): "qp_sq_wqhdr" / "qp_rq_wqhdr" are assigned before the
 *    later allocations and are not reset on the failure paths below;
 *    confirm that callers do not use them after a failed init.
 */
DAT_RETURN
dapls_tavor_wrid_init(ib_qp_handle_t qp)
{
        dapls_tavor_workq_hdr_t         *swq;
        dapls_tavor_workq_hdr_t         *rwq;
        dapls_tavor_wrid_list_hdr_t     *s_wridlist;
        dapls_tavor_wrid_list_hdr_t     *r_wridlist;
        /* Remember which headers were created here, for unwinding */
        uint_t          create_new_swq = 0;
        uint_t          create_new_rwq = 0;

        /*
         * For each of this QP's Work Queues, make sure we have a (properly
         * initialized) Work Request ID list attached to the relevant
         * completion queue.  Grab the CQ lock(s) before manipulating the
         * lists.
         */
        dapli_tavor_wrid_wqhdr_lock_both(qp);
        swq = dapli_tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_num,
            TAVOR_WR_SEND);
        if (swq == NULL) {
                /* Couldn't find matching work queue header, create it */
                create_new_swq = 1;
                /* Last argument 1: allocate the WRID lock with the header */
                swq = dapli_tavor_wrid_wqhdr_create(qp->qp_sq_cqhdl,
                    qp->qp_num, TAVOR_WR_SEND, 1);
                if (swq == NULL) {
                        /*
                         * If we couldn't find/allocate space for the workq
                         * header, then drop the lock(s) and return failure.
                         */
                        dapli_tavor_wrid_wqhdr_unlock_both(qp);
                        return (DAT_INSUFFICIENT_RESOURCES);
                }
        }
        /* Attach the header to the QP and reset its queue state */
        qp->qp_sq_wqhdr = swq;
        swq->wq_size = qp->qp_sq_numwqe;
        swq->wq_head = 0;
        swq->wq_tail = 0;
        swq->wq_full = 0;

        /*
         * Allocate space for the dapls_tavor_wrid_entry_t container
         */
        s_wridlist = dapli_tavor_wrid_get_list(swq->wq_size, 0);
        if (s_wridlist == NULL) {
                /*
                 * If we couldn't allocate space for tracking the WRID
                 * entries, then cleanup the workq header from above (if
                 * necessary, i.e. if we created the workq header).  Then
                 * drop the lock(s) and return failure.
                 */
                if (create_new_swq) {
                        dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
                }

                dapli_tavor_wrid_wqhdr_unlock_both(qp);
                return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
        }
        /* Back pointer used later when the container is reaped */
        s_wridlist->wl_wqhdr = swq;
        /* Chain the new WRID list container to the workq hdr list */
        dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
        dapli_tavor_wrid_wqhdr_add(swq, s_wridlist);
        dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);


        /*
         * Now we repeat all the above operations for the receive work queue
         */
        rwq = dapli_tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_num,
            TAVOR_WR_RECV);
        if (rwq == NULL) {
                create_new_rwq = 1;
                /* if qp is attached to an SRQ don't need to alloc wrid_lock */
                rwq = dapli_tavor_wrid_wqhdr_create(qp->qp_rq_cqhdl,
                    qp->qp_num, TAVOR_WR_RECV, qp->qp_srq_enabled ? 0 : 1);
                if (rwq == NULL) {
                        /*
                         * If we couldn't find/allocate space for the workq
                         * header, then free all the send queue resources we
                         * just allocated and setup (above), drop the lock(s)
                         * and return failure.
                         */
                        dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
                        dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
                        dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
                        if (create_new_swq) {
                                dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
                                    swq);
                        }

                        dapli_tavor_wrid_wqhdr_unlock_both(qp);
                        return (DAT_INSUFFICIENT_RESOURCES |
                            DAT_RESOURCE_MEMORY);
                }
        }
        /* Attach the header to the QP and reset its queue state */
        qp->qp_rq_wqhdr = rwq;
        rwq->wq_size = qp->qp_rq_numwqe;
        rwq->wq_head = 0;
        rwq->wq_tail = 0;
        rwq->wq_full = 0;

        /*
         * Allocate space for the dapls_tavor_wrid_entry_t container
         * For qp associated with SRQs the SRQ wridlist is used
         */
        if (qp->qp_srq_enabled) {
                /* Use existing srq_wridlist pointer */
                r_wridlist = qp->qp_srq->srq_wridlist;
                dapl_os_assert(r_wridlist != NULL);
                /* store the wl_lock in the wqhdr */
                rwq->wq_wrid_lock = r_wridlist->wl_lock;
                dapl_os_assert(rwq->wq_wrid_lock != NULL);
        } else {
                /* Allocate memory for the r_wridlist */
                r_wridlist = dapli_tavor_wrid_get_list(rwq->wq_size, 0);
        }
        /*
         * Given the assert above, a NULL here can only come from the
         * allocation in the non-SRQ branch.
         */
        if (r_wridlist == NULL) {
                /*
                 * If we couldn't allocate space for tracking the WRID
                 * entries, then cleanup all the stuff from above.  Then
                 * drop the lock(s) and return failure.
                 */
                dapl_os_lock(&swq->wq_wrid_lock->wrl_lock);
                dapli_tavor_wrid_wqhdr_remove(swq, s_wridlist);
                dapl_os_unlock(&swq->wq_wrid_lock->wrl_lock);
                if (create_new_swq) {
                        dapli_tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
                }
                if (create_new_rwq) {
                        dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
                }

                dapli_tavor_wrid_wqhdr_unlock_both(qp);
                return (DAT_INSUFFICIENT_RESOURCES | DAT_RESOURCE_MEMORY);
        }

        /* For SRQ based QPs r_wridlist does not point to recv wqhdr */
        if (!qp->qp_srq_enabled) {
                r_wridlist->wl_wqhdr = rwq;
        }

        /* Chain the new WRID list "container" to the workq hdr list */
        dapl_os_lock(&rwq->wq_wrid_lock->wrl_lock);
        dapli_tavor_wrid_wqhdr_add(rwq, r_wridlist);
        dapl_os_unlock(&rwq->wq_wrid_lock->wrl_lock);

        dapli_tavor_wrid_wqhdr_unlock_both(qp);

        return (DAT_SUCCESS);
}


/*
 * dapls_tavor_wrid_cleanup()
 *    Detach a QP's WRID containers when the QP goes away.
 *
 *    The send container is always moved to the send CQ's "reapable" list.
 *    It may still hold unpolled entries; that is fine, because the list is
 *    not reaped until either the Poll CQ command detects an empty condition
 *    or the CQ itself is freed.
 *
 *    The receive side depends on whether the QP uses an SRQ:
 *      - without an SRQ, the receive container is moved to the receive
 *        CQ's "reapable" list in the same way;
 *      - with an SRQ, this QP's entries are flushed from the CQ, the
 *        container is unchained from the receive work queue header, and
 *        the header is removed from the CQ.
 *
 *    The CQ lock(s) are held across all of this.
 */
void
dapls_tavor_wrid_cleanup(DAPL_EP *ep, ib_qp_handle_t qp)
{
        dapli_tavor_wrid_wqhdr_lock_both(qp);

        /* Send side: hand the container to the reaper */
        dapli_tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);

        /* Receive side without an SRQ is handled the same way */
        if (!qp->qp_srq_enabled) {
                dapli_tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
                dapli_tavor_wrid_wqhdr_unlock_both(qp);
                return;
        }

        /*
         * SRQ case.  Pull off all (if any) entries for this QP from the CQ;
         * this only includes entries that have not yet been polled.
         */
        dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        DAPL_FLUSH(ep)(qp);

        /* Unchain the wridlist from the receive work queue header */
        dapli_tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
            qp->qp_rq_wqhdr->wq_wrid_post);
        dapl_os_assert(qp->qp_rq_wqhdr->wq_wrid_post == NULL);

        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

        /* The header has no container left; drop it from the CQ */
        dapli_tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);

        dapli_tavor_wrid_wqhdr_unlock_both(qp);
}

/*
 * dapli_tavor_wrid_wqhdr_create()
 *    Allocate a work queue header for (qpn, send_or_recv), initialize it
 *    and insert it into the work queue header table of "cq".
 *
 *    If "alloc_wrl" is non-zero a dapls_tavor_wrid_lock_t is allocated in
 *    the same chunk of memory, right behind the header, and initialized.
 *    Otherwise wq_wrid_lock is set to NULL and it is left to the caller to
 *    attach a lock.
 *
 *    Returns the new header, or NULL if the allocation or the insertion
 *    into the CQ's table failed.
 */
static dapls_tavor_workq_hdr_t *
dapli_tavor_wrid_wqhdr_create(ib_cq_handle_t cq, uint_t qpn,
    uint_t send_or_recv, uint_t alloc_wrl)
{
        dapls_tavor_workq_hdr_t *wqhdr_tmp;
        size_t                  size, aligned_size;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

        /*
         * Allocate space for a work queue header structure and initialize it.
         * Each work queue header structure includes a "wq_wrid_lock"
         * which needs to be initialized.
         *
         * Note: the address smashing is needed to ensure wq_wrid_lock is
         * 8-byte aligned, which is not always the case on 32-bit sparc.
         */
        size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;
        aligned_size = size;
        if (alloc_wrl) {
                /* for non-srq wqhdr the lock is allocated with the wqhdr */
                size = size + sizeof (dapls_tavor_wrid_lock_t);
        }
        wqhdr_tmp = dapl_os_alloc(size);
        if (wqhdr_tmp == NULL) {
                return (NULL);
        }
        if (alloc_wrl) {
                /* The embedded lock starts right after the padded header */
                wqhdr_tmp->wq_wrid_lock = (dapls_tavor_wrid_lock_t *)
                    (((uintptr_t)wqhdr_tmp + aligned_size) & ~0x7);
                dapl_os_lock_init(&wqhdr_tmp->wq_wrid_lock->wrl_lock);
                /* wrl allocated with wqhdr don't have srq enabled */
                wqhdr_tmp->wq_wrid_lock->wrl_on_srq = 0;
        } else {
                /*
                 * No embedded lock.  Do not leave the pointer holding
                 * whatever the allocator returned:
                 * dapli_tavor_cq_wqhdr_remove() tests it against NULL
                 * before dereferencing it.
                 */
                wqhdr_tmp->wq_wrid_lock = NULL;
        }

        wqhdr_tmp->wq_qpn       = qpn;
        wqhdr_tmp->wq_send_or_recv = send_or_recv;

        wqhdr_tmp->wq_wrid_poll = NULL;
        wqhdr_tmp->wq_wrid_post = NULL;

        /* Chain the newly allocated work queue header to the CQ's list */
        if (dapli_tavor_cq_wqhdr_add(cq, wqhdr_tmp) != DAT_SUCCESS) {
                /* Undo everything done above; "size" covers the lock too */
                if (alloc_wrl) {
                        dapl_os_lock_destroy(&wqhdr_tmp->wq_wrid_lock->
                            wrl_lock);
                }
                dapl_os_free(wqhdr_tmp, size);
                wqhdr_tmp = NULL;
        }

        return (wqhdr_tmp);
}

/*
 * dapli_tavor_wrid_wqhdr_add()
 *    Append the WRID list "container" "wridlist" to the chain hanging off
 *    "wqhdr" and make it the container used for posting.
 */
static void
dapli_tavor_wrid_wqhdr_add(dapls_tavor_workq_hdr_t *wqhdr,
    dapls_tavor_wrid_list_hdr_t *wridlist)
{
        dapls_tavor_wrid_list_hdr_t     *tail;

        /* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */

        tail = wqhdr->wq_wrid_post;
        if ((tail != NULL) || (wqhdr->wq_wrid_poll != NULL)) {
                /* Chain is in use: link the new container behind the tail */
                tail->wl_next = wridlist;
                wridlist->wl_prev = tail;
        } else {
                /* First container: it is also the one to poll from */
                wqhdr->wq_wrid_poll = wridlist;
        }

        /* In both cases the new container becomes the post target */
        wqhdr->wq_wrid_post = wridlist;
}


/*
 * dapli_tavor_wrid_wqhdr_remove()
 *    Unlink the WRID list "container" "wridlist" from the chain hanging
 *    off "wqhdr".
 *
 *    Note: this is only called to remove the most recently added WRID list
 *    container.
 */
static void
dapli_tavor_wrid_wqhdr_remove(dapls_tavor_workq_hdr_t *wqhdr,
    dapls_tavor_wrid_list_hdr_t *wridlist)
{
        dapls_tavor_wrid_list_hdr_t     *before = wridlist->wl_prev;
        dapls_tavor_wrid_list_hdr_t     *after = wridlist->wl_next;

        /* dapl_os_assert(MUTEX_HELD(&wqhdr->wq_wrid_lock)); */

        /* Let the neighbours (if any) point past the removed container */
        if (after != NULL) {
                after->wl_prev = before;
        }
        if (before != NULL) {
                before->wl_next = after;
        }

        /*
         * Fix up the work queue header: the poll pointer is cleared and
         * the post pointer falls back to the previous container.
         */
        if (wqhdr->wq_wrid_poll == wridlist) {
                wqhdr->wq_wrid_poll = NULL;
        }
        if (wqhdr->wq_wrid_post == wridlist) {
                wqhdr->wq_wrid_post = before;
        }
}


/*
 * dapli_tavor_wrid_wqhdr_lock_both()
 *    Acquire the "cq_wrid_wqhdr_lock" of the CQ(s) used by the send and
 *    receive work queues of "qp".
 */
static void
dapli_tavor_wrid_wqhdr_lock_both(ib_qp_handle_t qp)
{
        ib_cq_handle_t  send_cq = qp->qp_sq_cqhdl;
        ib_cq_handle_t  recv_cq = qp->qp_rq_cqhdl;

        /*
         * The send CQ lock is always taken, and always taken first.  The
         * receive CQ lock is taken in addition only when the receive work
         * queue uses a different CQ; with a shared CQ there is just the one
         * common lock.  dapli_tavor_wrid_wqhdr_unlock_both() releases in
         * the reverse order, which keeps the ordering consistent and
         * avoids introducing a dead lock condition.
         */
        dapl_os_lock(&send_cq->cq_wrid_wqhdr_lock);
        if (recv_cq != send_cq) {
                dapl_os_lock(&recv_cq->cq_wrid_wqhdr_lock);
        }
}

/*
 * dapli_tavor_wrid_wqhdr_unlock_both()
 *    Release the "cq_wrid_wqhdr_lock" of the CQ(s) used by the send and
 *    receive work queues of "qp".
 */
static void
dapli_tavor_wrid_wqhdr_unlock_both(ib_qp_handle_t qp)
{
        ib_cq_handle_t  send_cq = qp->qp_sq_cqhdl;
        ib_cq_handle_t  recv_cq = qp->qp_rq_cqhdl;

        /*
         * Mirror image of dapli_tavor_wrid_wqhdr_lock_both(): drop the
         * receive CQ lock first (only if it is a separate CQ), then the
         * send CQ lock.
         */
        if (recv_cq != send_cq) {
                dapl_os_unlock(&recv_cq->cq_wrid_wqhdr_lock);
        }
        dapl_os_unlock(&send_cq->cq_wrid_wqhdr_lock);
}


/*
 * dapli_tavor_cq_wqhdr_add()
 *    Insert "wqhdr" into the work queue header hash table of "cq".
 *    Returns whatever dapls_hash_insert() returns.
 */
static DAT_RETURN
dapli_tavor_cq_wqhdr_add(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
        uint64_t        type_bits;
        DAPL_HASH_KEY   hkey;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

        /*
         * The hash key combines the queue type (send or recv), shifted
         * into the upper 32 bits, with the QP number.
         */
        type_bits = (uint64_t)wqhdr->wq_send_or_recv << 32;
        hkey = (DAPL_HASH_KEY)(type_bits | wqhdr->wq_qpn);

        return (dapls_hash_insert(cq->cq_wrid_wqhdr_list, hkey, wqhdr));
}


/*
 * dapli_tavor_cq_wqhdr_remove
 *    Take "wqhdr" out of the work queue header hash table of "cq" and
 *    free it.  A lock that is not marked as belonging to an SRQ is
 *    destroyed and its size is included in the amount freed.
 */
static void
dapli_tavor_cq_wqhdr_remove(ib_cq_handle_t cq, dapls_tavor_workq_hdr_t *wqhdr)
{
        dapls_tavor_wrid_lock_t *wrl;
        DAPL_HASH_DATA          removed;
        DAPL_HASH_KEY           hkey;
        uint64_t                type_bits;
        size_t                  free_size;

        /* dapl_os_assert(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); */

        /* Same key layout as used when the header was inserted */
        type_bits = (uint64_t)wqhdr->wq_send_or_recv << 32;
        hkey = (DAPL_HASH_KEY)(type_bits | wqhdr->wq_qpn);

        /* The removed hash data is not needed; the result is ignored */
        (void) dapls_hash_remove(cq->cq_wrid_wqhdr_list, hkey, &removed);

        /* The header itself, rounded up to a multiple of 8 bytes */
        free_size = (sizeof (dapls_tavor_workq_hdr_t) + 0x7) & ~0x7;

        wrl = wqhdr->wq_wrid_lock;
        if ((wrl != NULL) && (wrl->wrl_on_srq == 0)) {
                /* The lock lives in the same allocation as the header */
                dapl_os_lock_destroy(&wrl->wrl_lock);
                free_size += sizeof (dapls_tavor_wrid_lock_t);
        }

        /* Free the memory associated with "wqhdr" */
        dapl_os_free(wqhdr, free_size);
}

/*
 * dapls_tavor_srq_wrid_resize() is called to resize the wridlist
 * associated with SRQS as a result of dat_srq_resize().
 *
 * srq_handle: SRQ whose wridlist (srq_wridlist) is to be resized
 * new_size:   new number of entries in the wridlist
 *
 * Only growing is supported.  A request for the current size is a
 * successful no-op; a request for a smaller size is rejected without
 * touching the wridlist.
 *
 * Returns: DAT_TRUE if successful, otherwise DAT_FALSE
 */
DAT_BOOLEAN
dapls_tavor_srq_wrid_resize(ib_srq_handle_t srq_handle, uint32_t new_size)
{
        dapls_tavor_wrid_list_hdr_t     *wridlist;
        dapls_tavor_wrid_entry_t        *old_wl_wre;
        dapls_tavor_wrid_entry_t        *new_wl_wre = NULL;
        uint32_t                        *old_wl_freel;
        uint32_t                        *new_wl_freel;
        uint32_t                        old_size;
        uint32_t                        idx;
        uint32_t                        prev_idx;
        uint32_t                        i;

        wridlist = srq_handle->srq_wridlist;

        if (wridlist == NULL) {
                return (DAT_FALSE);
        }
        dapl_os_assert(wridlist->wl_srq_en);

        dapl_os_lock(&wridlist->wl_lock->wrl_lock);

        old_wl_wre = wridlist->wl_wre;
        old_wl_freel = wridlist->wl_free_list;
        old_size = wridlist->wl_size;

        /*
         * The code below computes "new_size - old_size - 1" in unsigned
         * arithmetic and copies old_size entries into arrays of new_size
         * entries, so it is only valid when the list actually grows.
         */
        if (new_size == old_size) {
                /* Nothing to do */
                dapl_os_unlock(&wridlist->wl_lock->wrl_lock);
                return (DAT_TRUE);
        }
        if (new_size < old_size) {
                /* Shrinking is not supported */
                goto bail;
        }

        new_wl_wre = (dapls_tavor_wrid_entry_t *)dapl_os_alloc(new_size *
            sizeof (dapls_tavor_wrid_entry_t));
        if (new_wl_wre == NULL) {
                goto bail;
        }
        new_wl_freel = dapl_os_alloc(new_size * sizeof (uint32_t));
        if (new_wl_freel == NULL) {
                goto bail;
        }
        /*
         * we just need to copy the old WREs to the new array. Since the
         * descriptors are relatively addressed the descriptor to index
         * mapping doesn't change.
         */
        (void) dapl_os_memcpy(&new_wl_wre[0], &old_wl_wre[0],
            old_size * sizeof (dapls_tavor_wrid_entry_t));
        /*
         * Copy the old free list to the new one.  The old list is walked
         * as a ring starting at its head; the copy starts at slot 0.
         */
        idx = wridlist->wl_freel_head;
        for (i = 0; i < wridlist->wl_freel_entries; i++) {
                new_wl_freel[i] = old_wl_freel[idx];
                idx = (idx + 1) % old_size;
        }
        /*
         * Add the new entries in wl_wre to the new free list.  The first
         * new descriptor sits old_size WQEs past wl_srq_desc_addr, each
         * following one is a further wl_srq_wqesz behind its predecessor.
         */
        idx = wridlist->wl_freel_entries;
        new_wl_freel[idx] = wridlist->wl_srq_desc_addr + old_size *
            wridlist->wl_srq_wqesz;
        prev_idx = idx;
        idx = (idx + 1) % new_size;
        for (i = 0; i < new_size - old_size - 1; i++) {
                new_wl_freel[idx] = new_wl_freel[prev_idx] +
                    wridlist->wl_srq_wqesz;
                prev_idx = idx;
                idx = (idx + 1) % new_size;
        }

        /* Switch the wridlist over to the new arrays */
        wridlist->wl_size = new_size;
        wridlist->wl_wre = new_wl_wre;
        wridlist->wl_free_list = new_wl_freel;
        wridlist->wl_freel_head = 0;
        wridlist->wl_freel_tail = idx;
        wridlist->wl_freel_entries = wridlist->wl_freel_entries + new_size -
            old_size;

        dapl_os_unlock(&wridlist->wl_lock->wrl_lock);

        /* The old arrays are no longer referenced by the wridlist */
        if (old_wl_wre) {
                dapl_os_free(old_wl_wre, old_size *
                    sizeof (dapls_tavor_wrid_entry_t));
        }
        if (old_wl_freel) {
                dapl_os_free(old_wl_freel, old_size * sizeof (uint32_t));
        }
        return (DAT_TRUE);
bail:
        dapl_os_unlock(&wridlist->wl_lock->wrl_lock);
        /* new_wl_freel is never allocated when we get here */
        if (new_wl_wre) {
                dapl_os_free(new_wl_wre, new_size *
                    sizeof (dapls_tavor_wrid_entry_t));
        }
        return (DAT_FALSE);
}