root/usr/src/uts/common/rpc/ib.h
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code is developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements to contributions from developors:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */


#ifndef _IB_H
#define _IB_H

/*
 * ib.h, rpcib plugin interface.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <rpc/rpc.h>
#include <rpc/rpc_rdma.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/avl.h>

#ifdef __cplusplus
extern "C" {
#endif

#define MAX_BUFS        1024    /* max no. of buffers per pool */

#define DEF_CQ_SIZE     4096 - 1        /* default CQ size */
                                /*
                                 * Tavor returns the next higher power of 2
                                 * CQ entries than the requested size.
                                 * For instance, if you request (2^12 - 1)
                                 * CQ entries, Tavor returns 2^12 entries.
                                 * 4K CQ entries suffice.  Hence, 4096 - 1.
                                 */
#define DEF_SQ_SIZE     128     /* default SendQ size */
#define DEF_RQ_SIZE     256     /* default RecvQ size */
#define DSEG_MAX        2
#define RQ_DSEG_MAX     1       /* default RQ data seg */
#define IBSRM_HB        0x8000  /* high order bit of pkey */

/* max no. of refresh attempts on IBT_CM_CONN_STALE error */
#define REFRESH_ATTEMPTS        3

typedef struct rib_hca_s rib_hca_t;
typedef struct rib_qp_s rib_qp_t;
typedef struct rib_cq_s rib_cq_t;

/*
 * Notification for RDMA_DONE is based on xid
 */
struct rdma_done_list {
        uint32_t        xid;            /* XID waiting for RDMA_DONE */
        kcondvar_t      rdma_done_cv;   /* cv for RDMA_DONE */
        struct rdma_done_list   *next;
        struct rdma_done_list   *prev;
};

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests
 * NO_ACCEPT = not accepting new connection and requests
 */
#define ACCEPT          1
#define NO_ACCEPT       2

/*
 * Send Wait states
 */
#define SEND_WAIT       -1

/*
 * Reply states
 */
#define REPLY_WAIT      -1

typedef void * rib_pvoid;
typedef rib_pvoid RIB_SYNCMEM_HANDLE;

/*
 * IB buffer pool management structure
 */

/*
 * Buffer pool info
 */
typedef struct {
        kmutex_t        buflock;        /* lock for this structure */
        caddr_t         buf;            /* pool address */
        uint32_t        bufhandle;      /* rkey for this pool */
        ulong_t         bufsize;        /* size of pool */
        int             rsize;          /* size of each element */
        int             numelems;       /* no. of elements allocated */
        int             buffree;        /* no. of free elements */
        void            *buflist[1];    /* free elements in pool */
} bufpool_t;

typedef struct {
        bufpool_t       *bpool;
        ibt_mr_hdl_t    *mr_hdl;
        ibt_mr_desc_t   *mr_desc;       /* vaddr, lkey, rkey */
} rib_bufpool_t;

/*
 * ATS relsted defines and structures.
 */
#define ATS_AR_DATA_LEN 16


/*
 * Service types supported by RPCIB
 * For now only NFS is supported.
 */
#define NFS             1
#define NLM             2

/*
 * Tracks consumer state (client or server).
 */
typedef enum {
        RIB_SERVER,
        RIB_CLIENT
} rib_mode_t;

/*
 * CQ structure
 */
struct rib_cq_s {
        rib_hca_t               *rib_hca;
        ibt_cq_hdl_t            rib_cq_hdl;
};

/*
 * Each registered service's data structure.
 */
typedef struct rib_service_s rib_service_t;
struct rib_service_s {
        uint32_t                srv_type;       /* i.e, NFS, NLM, v4CBD */
        ibt_srv_hdl_t           srv_hdl;        /* from ibt_register call */
        ib_svc_id_t             srv_id;
        rib_service_t           *next;
};

/*
 * RPCIB plugin state
 */
typedef struct rpcib_state {
        ibt_clnt_hdl_t          ibt_clnt_hdl;
        uint32_t                hca_count;
        uint32_t                nhca_inited;
        rib_hca_t               *hcas_list;
        krwlock_t               hcas_list_lock; /* protects hcas_list */
        int                     refcount;
        kmutex_t                open_hca_lock;
        queue_t                 *q;             /* up queue for a serv_type */
        void                    *private;
        rib_service_t           *service_list;
        krwlock_t               service_list_lock;
        kmutex_t                listen_lock;
} rpcib_state_t;

/*
 * Connection lists
 */
typedef struct {
        krwlock_t       conn_lock;      /* list lock */
        CONN            *conn_hd;       /* list head */
} rib_conn_list_t;

enum hca_state {
        HCA_DETACHED,           /* hca in detached state */
        HCA_INITED,             /* hca in up and running state */
};

typedef struct rib_hca_service_s rib_hca_service_t;
struct rib_hca_service_s {
        ib_svc_id_t     srv_id;
        ib_gid_t        gid;
        ibt_sbind_hdl_t sbind_hdl;
        rib_hca_service_t *next;
};

/*
 * RPCIB per HCA structure
 */
struct rib_hca_s {
        ibt_clnt_hdl_t          ibt_clnt_hdl;

        /*
         * per HCA.
         */
        ibt_hca_hdl_t           hca_hdl;        /* HCA handle */
        ibt_hca_attr_t          hca_attrs;      /* HCA attributes */
        ibt_pd_hdl_t            pd_hdl;
        rib_hca_service_t       *bound_services;
        krwlock_t               bound_services_lock;
        ib_guid_t               hca_guid;
        uint32_t                hca_nports;
        ibt_hca_portinfo_t      *hca_ports;
        size_t                  hca_pinfosz;
        enum hca_state          state;          /* state of HCA */
        krwlock_t               state_lock;     /* protects state field */
        bool_t                  inuse;          /* indicates HCA usage */
        kmutex_t                inuse_lock;     /* protects inuse field */

        rib_conn_list_t         cl_conn_list;   /* client conn list */
        rib_conn_list_t         srv_conn_list;  /* server conn list */

        rib_cq_t                *clnt_scq;
        rib_cq_t                *clnt_rcq;
        rib_cq_t                *svc_scq;
        rib_cq_t                *svc_rcq;
        kmutex_t                cb_lock;
        kcondvar_t              cb_cv;

        rib_bufpool_t           *recv_pool;     /* recv buf pool */
        rib_bufpool_t           *send_pool;     /* send buf pool */

        void                    *iblock;        /* interrupt cookie */

        kmem_cache_t    *server_side_cache;     /* long reply pool */
        avl_tree_t      avl_tree;
        kmutex_t        avl_lock;
        krwlock_t       avl_rw_lock;
        volatile bool_t avl_init;
        kmutex_t        cache_allocation_lock;
        ddi_taskq_t     *cleanup_helper;
        ib_svc_id_t     srv_id;
        ibt_srv_hdl_t   srv_hdl;
        uint_t          reg_state;

        volatile uint64_t       cache_allocation;
        uint64_t        cache_hits;
        uint64_t        cache_misses;
        uint64_t        cache_cold_misses;
        uint64_t        cache_hot_misses;
        uint64_t        cache_misses_above_the_limit;

        struct rib_hca_s *next;
};


/*
 * Structure on wait state of a post send
 */
struct send_wid {
        uint32_t        xid;
        int             cv_sig;
        kmutex_t        sendwait_lock;
        kcondvar_t      wait_cv;
        uint_t          status;
        rib_qp_t        *qp;
        int             nsbufs;                 /* # of send buffers posted */
        uint64_t        sbufaddr[DSEG_MAX];     /* posted send buffers */
        caddr_t         c;
        caddr_t         c1;
        int             l1;
        caddr_t         c2;
        int             l2;
        int             wl, rl;
};

/*
 * Structure on reply descriptor for recv queue.
 * Different from the above posting of a descriptor.
 */
struct reply {
        uint32_t        xid;
        uint_t          status;
        uint64_t        vaddr_cq;       /* buf addr from CQ */
        uint_t          bytes_xfer;
        kcondvar_t      wait_cv;
        struct reply    *next;
        struct reply    *prev;
};

struct svc_recv {
        rib_qp_t        *qp;
        uint64_t        vaddr;
        uint_t          bytes_xfer;
};

struct recv_wid {
        uint32_t        xid;
        rib_qp_t        *qp;
        uint64_t        addr;   /* posted buf addr */
};

/*
 * Per QP data structure
 */
struct rib_qp_s {
        rib_hca_t               *hca;
        rib_mode_t              mode;   /* RIB_SERVER or RIB_CLIENT */
        CONN                    rdmaconn;
        ibt_channel_hdl_t       qp_hdl;
        uint_t                  port_num;
        ib_qpn_t                qpn;
        int                     chan_flags;
        clock_t                 timeout;
        ibt_rc_chan_query_attr_t        qp_q_attrs;
        rib_cq_t                *send_cq;       /* send CQ */
        rib_cq_t                *recv_cq;       /* recv CQ */

        /*
         * Number of pre-posted rbufs
         */
        uint_t                  n_posted_rbufs;
        kcondvar_t              posted_rbufs_cv;
        kmutex_t                posted_rbufs_lock;

        /*
         * Number of SENDs pending completion
         */

        uint_t                  n_send_rbufs;
        kcondvar_t              send_rbufs_cv;
        kmutex_t                send_rbufs_lock;

        /*
         * RPC reply
         */
        uint_t                  rep_list_size;
        struct reply            *replylist;
        kmutex_t                replylist_lock;

        /*
         * server only, RDMA_DONE
         */
        struct rdma_done_list   *rdlist;
        kmutex_t                rdlist_lock;

        kmutex_t                cb_lock;
        kcondvar_t              cb_conn_cv;

        caddr_t                 q;      /* upstream queue */
        struct send_wid         wd;
};

#define ctoqp(conn)     ((rib_qp_t *)((conn)->c_private))
#define qptoc(rqp)      ((CONN *)&((rqp)->rdmaconn))

/*
 * Timeout for various calls
 */
#define CONN_WAIT_TIME  40
#define SEND_WAIT_TIME  40      /* time for send completion */

#define REPLY_WAIT_TIME 40      /* time to get reply from remote QP */

#ifdef __cplusplus
}
#endif

#endif  /* !_IB_H */