root/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_rdma.c
 * Oracle elects to have and use the contents of ib_rdma.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/rds.h>
#include <netinet/in.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

/*
 * This is stored as mr->r_trans_private.
 */
struct rdsv3_ib_mr {
        list_node_t             m_obj; /* list obj of rdsv3_fmr_pool list */
        struct rdsv3_ib_device  *m_device;
        struct rdsv3_fmr_pool   *m_pool; /* hca fmr pool */
        unsigned int            m_inval:1;

        struct rdsv3_scatterlist        *sg;
        unsigned int            sg_len;
        uint64_t                *dma;
        int                     sg_dma_len;

        /* DDI pinned memory */
        ddi_umem_cookie_t       umem_cookie;
        /* IBTF type definitions */
        ibt_hca_hdl_t           rc_hca_hdl;
        ibt_fmr_pool_hdl_t      fmr_pool_hdl;
        ibt_ma_hdl_t            rc_ma_hdl;
        ibt_mr_hdl_t            rc_fmr_hdl;
        ibt_pmr_desc_t          rc_mem_desc;
};

/*
 * delayed freed fmr's
 */
struct rdsv3_fmr_pool {
        list_t                  f_list; /* list of freed mr */
        kmutex_t                f_lock; /* lock of fmr pool */
        int32_t                 f_listcnt;
};

static int rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev,
        ibt_fmr_pool_hdl_t pool_hdl, int free_all);
static void rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr);
static void rdsv3_ib_mr_pool_flush_worker(struct rdsv3_work_s *work);
static struct rdsv3_ib_mr *rdsv3_ib_alloc_fmr(struct rdsv3_ib_device
        *rds_ibdev);
static int rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev,
        struct rdsv3_ib_mr *ibmr, struct buf *bp, unsigned int nents);

static struct rdsv3_ib_device *
rdsv3_ib_get_device(uint32_be_t ipaddr)
{
        struct rdsv3_ib_device *rds_ibdev;
        struct rdsv3_ib_ipaddr *i_ipaddr;

        RDSV3_DPRINTF4("rdsv3_ib_get_device", "Enter: ipaddr: 0x%x", ipaddr);

        RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) {
                rw_enter(&rds_ibdev->rwlock, RW_READER);
                RDSV3_FOR_EACH_LIST_NODE(i_ipaddr, &rds_ibdev->ipaddr_list,
                    list) {
                        if (i_ipaddr->ipaddr == ipaddr) {
                                rw_exit(&rds_ibdev->rwlock);
                                return (rds_ibdev);
                        }
                }
                rw_exit(&rds_ibdev->rwlock);
        }

        RDSV3_DPRINTF4("rdsv3_ib_get_device", "Return: ipaddr: 0x%x", ipaddr);

        return (NULL);
}

static int
rdsv3_ib_add_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
{
        struct rdsv3_ib_ipaddr *i_ipaddr;

        RDSV3_DPRINTF4("rdsv3_ib_add_ipaddr", "rds_ibdev: %p ipaddr: %x",
            rds_ibdev, ipaddr);

        i_ipaddr = kmem_alloc(sizeof (*i_ipaddr), KM_NOSLEEP);
        if (!i_ipaddr)
                return (-ENOMEM);

        i_ipaddr->ipaddr = ipaddr;

        rw_enter(&rds_ibdev->rwlock, RW_WRITER);
        list_insert_tail(&rds_ibdev->ipaddr_list, i_ipaddr);
        rw_exit(&rds_ibdev->rwlock);

        return (0);
}

static void
rdsv3_ib_remove_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
{
        struct rdsv3_ib_ipaddr *i_ipaddr, *next;
        struct rdsv3_ib_ipaddr *to_free = NULL;

        RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "rds_ibdev: %p, ipaddr: %x",
            rds_ibdev, ipaddr);

        rw_enter(&rds_ibdev->rwlock, RW_WRITER);
        RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, next, &rds_ibdev->ipaddr_list,
            list) {
                if (i_ipaddr->ipaddr == ipaddr) {
                        list_remove_node(&i_ipaddr->list);
                        to_free = i_ipaddr;
                        break;
                }
        }
        rw_exit(&rds_ibdev->rwlock);

        if (to_free) {
                kmem_free(i_ipaddr, sizeof (*i_ipaddr));
        }

        RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr",
            "Return: rds_ibdev: %p, ipaddr: %x", rds_ibdev, ipaddr);
}

int
rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr)
{
        struct rdsv3_ib_device *rds_ibdev_old;

        RDSV3_DPRINTF4("rdsv3_ib_update_ipaddr", "rds_ibdev: %p, ipaddr: %x",
            rds_ibdev, ipaddr);

        rds_ibdev_old = rdsv3_ib_get_device(ipaddr);
        if (rds_ibdev_old)
                rdsv3_ib_remove_ipaddr(rds_ibdev_old, ipaddr);

        return (rdsv3_ib_add_ipaddr(rds_ibdev, ipaddr));
}

void
rdsv3_ib_add_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn)
{
        struct rdsv3_ib_connection *ic = conn->c_transport_data;

        RDSV3_DPRINTF4("rdsv3_ib_add_conn", "rds_ibdev: %p, conn: %p",
            rds_ibdev, conn);

        /* conn was previously on the nodev_conns_list */
        mutex_enter(&ib_nodev_conns_lock);
        ASSERT(!list_is_empty(&ib_nodev_conns));
        ASSERT(list_link_active(&ic->ib_node));
        list_remove_node(&ic->ib_node);

        mutex_enter(&rds_ibdev->spinlock);
        list_insert_tail(&rds_ibdev->conn_list, ic);
        ic->i_on_dev_list = B_TRUE;
        mutex_exit(&rds_ibdev->spinlock);
        mutex_exit(&ib_nodev_conns_lock);
}

void
rdsv3_ib_remove_conn(struct rdsv3_ib_device *rds_ibdev,
    struct rdsv3_connection *conn)
{
        struct rdsv3_ib_connection *ic = conn->c_transport_data;

        RDSV3_DPRINTF4("rdsv3_ib_remove_conn", "rds_ibdev: %p, conn: %p",
            rds_ibdev, conn);

        /* place conn on nodev_conns_list */
        mutex_enter(&ib_nodev_conns_lock);

        mutex_enter(&rds_ibdev->spinlock);
        ASSERT(list_link_active(&ic->ib_node));
        list_remove_node(&ic->ib_node);
        ic->i_on_dev_list = B_FALSE;
        mutex_exit(&rds_ibdev->spinlock);

        list_insert_tail(&ib_nodev_conns, ic);

        mutex_exit(&ib_nodev_conns_lock);

        RDSV3_DPRINTF4("rdsv3_ib_remove_conn",
            "Return: rds_ibdev: %p, conn: %p", rds_ibdev, conn);
}

void
__rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock)
{
        struct rdsv3_ib_connection *ic, *_ic;
        list_t tmp_list;

        RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Enter: list: %p", list);

        /* avoid calling conn_destroy with irqs off */
        mutex_enter(list_lock);
        list_splice(list, &tmp_list);
        mutex_exit(list_lock);

        RDSV3_FOR_EACH_LIST_NODE_SAFE(ic, _ic, &tmp_list, ib_node) {
                rdsv3_conn_destroy(ic->conn);
        }

        RDSV3_DPRINTF4("__rdsv3_ib_destroy_conns", "Return: list: %p", list);
}

void
rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *rds_ibdev)
{
        struct rdsv3_fmr_pool *pool = rds_ibdev->fmr_pool;

        RDSV3_DPRINTF4("rdsv3_ib_destroy_mr_pool", "Enter: ibdev: %p",
            rds_ibdev);

        if (rds_ibdev->fmr_pool_hdl == NULL)
                return;

        if (pool) {
                list_destroy(&pool->f_list);
                kmem_free((void *) pool, sizeof (*pool));
        }

        (void) rdsv3_ib_flush_mr_pool(rds_ibdev, rds_ibdev->fmr_pool_hdl, 1);
        (void) ibt_destroy_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev),
            rds_ibdev->fmr_pool_hdl);
}

#define IB_FMR_MAX_BUF_SIZE     0x1000000       /* 16MB max buf */
int
rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *rds_ibdev)
{
        uint_t h_page_sz;
        ibt_fmr_pool_attr_t fmr_attr;
        ibt_status_t ibt_status;
        struct rdsv3_fmr_pool *pool;

        RDSV3_DPRINTF4("rdsv3_ib_create_mr_pool",
            "Enter: ibdev: %p", rds_ibdev);

        pool = (struct rdsv3_fmr_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP);
        if (pool == NULL) {
                return (-ENOMEM);
        }

        /* setup FMR pool attributes */
        h_page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024;

        fmr_attr.fmr_max_pages_per_fmr = (IB_FMR_MAX_BUF_SIZE / h_page_sz) + 2;
        fmr_attr.fmr_pool_size = RDSV3_FMR_POOL_SIZE;
        fmr_attr.fmr_dirty_watermark = 128;
        fmr_attr.fmr_cache = B_FALSE;
        fmr_attr.fmr_flags = IBT_MR_NOSLEEP  | IBT_MR_ENABLE_LOCAL_WRITE |
            IBT_MR_ENABLE_REMOTE_WRITE | IBT_MR_ENABLE_REMOTE_READ;
        fmr_attr.fmr_page_sz = h_page_sz;
        fmr_attr.fmr_func_hdlr = NULL;
        fmr_attr.fmr_func_arg = (void *) NULL;

        /* create the FMR pool */
        ibt_status = ibt_create_fmr_pool(rds_ibdev->ibt_hca_hdl,
            rds_ibdev->pd->ibt_pd, &fmr_attr, &rds_ibdev->fmr_pool_hdl);
        if (ibt_status != IBT_SUCCESS) {
                kmem_free((void *) pool, sizeof (*pool));
                rds_ibdev->fmr_pool = NULL;
                return (-ENOMEM);
        }

        list_create(&pool->f_list, sizeof (struct rdsv3_ib_mr),
            offsetof(struct rdsv3_ib_mr, m_obj));
        mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
        rds_ibdev->fmr_pool = pool;
        rds_ibdev->max_fmrs = fmr_attr.fmr_pool_size;
        rds_ibdev->fmr_message_size = fmr_attr.fmr_max_pages_per_fmr;

        RDSV3_DPRINTF2("rdsv3_ib_create_mr_pool",
            "Exit: ibdev: %p fmr_pool: %p", rds_ibdev, pool);
        return (0);
}

void
rdsv3_ib_get_mr_info(struct rdsv3_ib_device *rds_ibdev,
        struct rds_info_rdma_connection *iinfo)
{
        iinfo->rdma_mr_max = rds_ibdev->max_fmrs;
        iinfo->rdma_mr_size = rds_ibdev->fmr_message_size;
}

void *
rdsv3_ib_get_mr(struct rds_iovec *args, unsigned long nents,
        struct rdsv3_sock *rs, uint32_t *key_ret)
{
        struct rdsv3_ib_device *rds_ibdev;
        struct rdsv3_ib_mr *ibmr = NULL;
        ddi_umem_cookie_t umem_cookie;
        size_t umem_len;
        caddr_t umem_addr;
        int ret;
        struct buf *bp;

        RDSV3_DPRINTF4("rdsv3_ib_get_mr", "Enter: args.addr: %p", args->addr);

        rds_ibdev = rdsv3_ib_get_device(rs->rs_bound_addr);

        if (rds_ibdev == NULL)
                return (void *)(PTR_ERR(-EFAULT));

        ibmr = rdsv3_ib_alloc_fmr(rds_ibdev);
        if (IS_ERR(ibmr))
                return (ibmr);

        /* pin user memory pages */
        umem_len   = ptob(btopr(args->bytes +
            ((uintptr_t)args->addr & PAGEOFFSET)));
        umem_addr  = (caddr_t)((uintptr_t)args->addr & ~PAGEOFFSET);
        ret = umem_lockmemory(umem_addr, umem_len,
            DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ,
            &umem_cookie, NULL, NULL);
        if (ret != 0) {
                kmem_free((void *) ibmr, sizeof (*ibmr));
                ibmr = ERR_PTR(-ret);
                return (ibmr);
        }

        /* transpose umem_cookie to buf structure for rdsv3_ib_map_fmr() */
        bp = ddi_umem_iosetup(umem_cookie, 0, umem_len,
            B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);

        ret = rdsv3_ib_map_fmr(rds_ibdev, ibmr, bp, nents);
        freerbuf(bp);   /* free bp */
        if (ret == 0) {
                ibmr->umem_cookie = umem_cookie;
                *key_ret = (uint32_t)ibmr->rc_mem_desc.pmd_rkey;
                ibmr->m_device = rds_ibdev;
                ibmr->m_pool = rds_ibdev->fmr_pool;
                RDSV3_DPRINTF4("rdsv3_ib_get_mr",
                    "Return: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie);
                return (ibmr);
        } else { /* error return */
                RDSV3_DPRINTF2("rdsv3_ib_get_mr", "map_fmr failed (errno=%d)\n",
                    ret);
                ddi_umem_unlock(umem_cookie);
                kmem_free((void *)ibmr, sizeof (*ibmr));
                return (ERR_PTR(ret));
        }
}

static struct rdsv3_ib_mr *
rdsv3_ib_alloc_fmr(struct rdsv3_ib_device *rds_ibdev)
{
        struct rdsv3_ib_mr *ibmr;

        RDSV3_DPRINTF4("rdsv3_ib_alloc_fmr", "Enter: ibdev: %p", rds_ibdev);

        if (rds_ibdev->fmr_pool_hdl) {
                ibmr = (struct rdsv3_ib_mr *)kmem_zalloc(sizeof (*ibmr),
                    KM_SLEEP);
                ibmr->rc_hca_hdl = ib_get_ibt_hca_hdl(rds_ibdev->dev);
                ibmr->fmr_pool_hdl = rds_ibdev->fmr_pool_hdl;
                return (ibmr);
        }
        return (struct rdsv3_ib_mr *)(PTR_ERR(-ENOMEM));
}

static int
rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev, struct rdsv3_ib_mr *ibmr,
        struct buf *bp, unsigned int nents)
{
        ibt_va_attr_t va_attr;
        ibt_reg_req_t reg_req;
        uint_t paddr_list_len;
        uint_t page_sz;
        ibt_status_t ibt_status;
        /* LINTED E_FUNC_SET_NOT_USED */
        unsigned int l_nents = nents;

        RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Enter: ibmr: %p", ibmr);
        RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "buf addr: %p", bp->b_un.b_addr);

        /* setup ibt_map_mem_area attributes */
        bzero(&va_attr, sizeof (ibt_va_attr_t));
        va_attr.va_buf   = bp;
        va_attr.va_flags = IBT_VA_FMR | IBT_VA_BUF;

        page_sz = rds_ibdev->hca_attr.hca_page_sz * 1024; /* in kbytes */
        paddr_list_len = (bp->b_bcount / page_sz) + 2; /* start + end pg */

        /* map user buffer to HCA address */
        ibt_status = ibt_map_mem_area(ibmr->rc_hca_hdl,
            &va_attr, paddr_list_len, &reg_req, &ibmr->rc_ma_hdl);
        if (ibt_status != IBT_SUCCESS) {
                return (-ENOMEM);
        }

        /*  use a free entry from FMR pool to register the specified memory */
        ibt_status = ibt_register_physical_fmr(ibmr->rc_hca_hdl,
            ibmr->fmr_pool_hdl,
            &reg_req.fn_arg, &ibmr->rc_fmr_hdl, &ibmr->rc_mem_desc);
        if (ibt_status != IBT_SUCCESS) {
                RDSV3_DPRINTF2("rdsv3_ib_map_fmr", "reg_phy_fmr failed %d",
                    ibt_status);
                (void) ibt_unmap_mem_area(ibmr->rc_hca_hdl,
                    ibmr->rc_ma_hdl);
                if (ibt_status == IBT_INSUFF_RESOURCE) {
                        return (-ENOBUFS);
                }
                return (-EINVAL);
        }
        RDSV3_DPRINTF4("rdsv3_ib_map_fmr", "Return: ibmr: %p rkey: 0x%x",
            ibmr, (uint32_t)ibmr->rc_mem_desc.pmd_rkey);
        return (0);
}

void
rdsv3_ib_sync_mr(void *trans_private, int direction)
{
        /* LINTED E_FUNC_SET_NOT_USED */
        void *l_trans_private = trans_private;
        /* LINTED E_FUNC_SET_NOT_USED */
        int l_direction = direction;

        /* FMR Sync not needed in Solaris on PCI-ex systems */

        RDSV3_DPRINTF4("rdsv3_ib_sync_mr", "Enter:");
}

void
rdsv3_ib_flush_mrs(void)
{
        struct rdsv3_ib_device *rds_ibdev;

        RDSV3_DPRINTF4("rdsv3_ib_flush_mrs", "Enter:");

        RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) {
                if (rds_ibdev->fmr_pool_hdl) {
                        (void) rdsv3_ib_flush_mr_pool(rds_ibdev,
                            rds_ibdev->fmr_pool_hdl, 0);
                }
        }
}

static void
rdsv3_ib_drop_mr(struct rdsv3_ib_mr *ibmr)
{
        /* return the fmr to the IBTF pool */
        (void) ibt_deregister_fmr(ibmr->rc_hca_hdl, ibmr->rc_fmr_hdl);
        (void) ibt_unmap_mem_area(ibmr->rc_hca_hdl, ibmr->rc_ma_hdl);
        (void) ddi_umem_unlock(ibmr->umem_cookie);
        kmem_free((void *) ibmr, sizeof (*ibmr));
}

void
rdsv3_ib_drain_mrlist_fn(void *data)
{
        struct rdsv3_fmr_pool *pool = (struct rdsv3_fmr_pool *)data;
        ibt_hca_hdl_t hca_hdl;
        ibt_fmr_pool_hdl_t fmr_pool_hdl;
        unsigned int inval;
        struct rdsv3_ib_mr *ibmr;
        list_t *listp = &pool->f_list;
        kmutex_t *lockp = &pool->f_lock;
        int i;

        inval = 0;
        i = 0;
        for (;;) {
                mutex_enter(lockp);
                ibmr = (struct rdsv3_ib_mr *)list_remove_head(listp);
                if (ibmr)
                        pool->f_listcnt--;
                mutex_exit(lockp);
                if (!ibmr)
                        break;
                if ((inval == 0) && ibmr->m_inval) {
                        inval = 1;
                        hca_hdl = ibmr->rc_hca_hdl;
                        fmr_pool_hdl = ibmr->fmr_pool_hdl;
                }
                i++;
                rdsv3_ib_drop_mr(ibmr);
        }
        if (inval)
                (void) ibt_flush_fmr_pool(hca_hdl, fmr_pool_hdl);
}

void
rdsv3_ib_free_mr(void *trans_private, int invalidate)
{
        struct rdsv3_ib_mr *ibmr = trans_private;
        rdsv3_af_thr_t *af_thr;

        RDSV3_DPRINTF4("rdsv3_ib_free_mr", "Enter: ibmr: %p inv: %d",
            ibmr, invalidate);

        /* save af_thr at local as ibmr might be freed at mutex_exit */
        af_thr = ibmr->m_device->fmr_soft_cq;
        ibmr->m_inval = (unsigned int) invalidate;
        mutex_enter(&ibmr->m_pool->f_lock);
        list_insert_tail(&ibmr->m_pool->f_list, ibmr);
        ibmr->m_pool->f_listcnt++;
        mutex_exit(&ibmr->m_pool->f_lock);

        rdsv3_af_thr_fire(af_thr);
}

static int
rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev,
    ibt_fmr_pool_hdl_t pool_hdl, int free_all)
{
        /* LINTED E_FUNC_SET_NOT_USED */
        int l_free_all = free_all;

        RDSV3_DPRINTF4("rdsv3_ib_flush_mr_pool", "Enter: pool: %p", pool_hdl);

        rdsv3_ib_stats_inc(s_ib_rdma_mr_pool_flush);

        (void) ibt_flush_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev),
            pool_hdl);
        return (0);
}