/* root/sys/dev/iser/icl_iser.h */
/*-
 * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef ICL_ISER_H
#define ICL_ISER_H

/*
 * iSCSI Common Layer for RDMA.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <sys/taskqueue.h>
#include <sys/bio.h>
#include <vm/uma.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_fmr_pool.h>
#include <rdma/rdma_cm.h>


/*
 * Logging helpers.  Every macro prints a severity tag, the name of the
 * calling function, the caller's message and a trailing newline.  X must be
 * a string literal because it is pasted between the prefix and the "\n".
 *
 * Output is gated by the global iser_debug level:
 *   ISER_DBG  - printed when iser_debug > 2
 *   ISER_INFO - printed when iser_debug > 1
 *   ISER_WARN - printed when iser_debug > 0
 *   ISER_ERR  - always printed (plain printf expression, no level check)
 */
#define ISER_DBG(X, ...)                                                \
        do {                                                            \
                if (unlikely(iser_debug > 2)) {                         \
                        printf("DEBUG: %s: " X "\n", __func__,          \
                                ## __VA_ARGS__);                        \
                }                                                       \
        } while (0)

#define ISER_INFO(X, ...)                                               \
        do {                                                            \
                if (unlikely(iser_debug > 1)) {                         \
                        printf("INFO: %s: " X "\n", __func__,           \
                                ## __VA_ARGS__);                        \
                }                                                       \
        } while (0)

#define ISER_WARN(X, ...)                                               \
        do {                                                            \
                if (unlikely(iser_debug > 0)) {                         \
                        printf("WARNING: %s: " X "\n", __func__,        \
                                ## __VA_ARGS__);                        \
                }                                                       \
        } while (0)

#define ISER_ERR(X, ...)                                                \
        printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)

/*
 * NOTE(review): presumably bit values for iser_hdr.flags (version and the
 * write/read STag-valid indicators) - confirm against the iSER specification.
 */
#define ISER_VER                        0x10
#define ISER_WSV                        0x08
#define ISER_RSV                        0x04

/*
 * Reserved 64-bit values at the top of the unsigned range.
 * NOTE(review): judging by the names these are work-request IDs that mark
 * fast-registration/local-invalidate and beacon work requests - confirm
 * against the completion handling code.
 */
#define ISER_FASTREG_LI_WRID            0xffffffffffffffffULL
#define ISER_BEACON_WRID                0xfffffffffffffffeULL

/*
 * 4 KiB helpers: the shift count, the size (1 << 12) and a mask that clears
 * the low 12 bits of a 64-bit value.
 */
#define SHIFT_4K        12
#define SIZE_4K (1ULL << SHIFT_4K)
#define MASK_4K (~(SIZE_4K-1))

/* support up to 512KB in one RDMA (0x80000 bytes / 4 KiB = 128 entries) */
#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
/* Default command count; the two RX limits below are derived from it. */
#define ISER_DEF_XMIT_CMDS_MAX 256

/*
 * The max RX (recv) WR supported by the iSER QP is defined by
 * max_recv_wr = commands_max + recv_beacon
 */
#define ISER_QP_MAX_RECV_DTOS  (ISER_DEF_XMIT_CMDS_MAX + 1)
/* One quarter of the default command count (256 >> 2 = 64). */
#define ISER_MIN_POSTED_RX              (ISER_DEF_XMIT_CMDS_MAX >> 2)

/* QP settings */
/* Maximal bounds on asynchronous (non SCSI-command) PDUs */
#define ISER_MAX_RX_MISC_PDUS           4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
#define ISER_MAX_TX_MISC_PDUS           6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/*
 * The max TX (send) WR supported by the iSER QP is defined by
 * max_send_wr = T * (1 + D) + C, where D is how many inflight dataouts we
 * expect to have at max for a SCSI command.  The tx posting & completion
 * handling code supports an -EAGAIN scheme where tx is suspended till the QP
 * has room for more send WRs.  D=8 comes from 64K/8K.
 */
#define ISER_INFLIGHT_DATAOUTS          8

/*
 * T is ISER_DEF_XMIT_CMDS_MAX, D is ISER_INFLIGHT_DATAOUTS and C is the sum
 * of the misc PDU bounds; the send beacon increases max_send_wr by 1.
 */
#define ISER_QP_MAX_REQ_DTOS            (ISER_DEF_XMIT_CMDS_MAX *    \
                                        (1 + ISER_INFLIGHT_DATAOUTS) + \
                                        ISER_MAX_TX_MISC_PDUS        + \
                                        ISER_MAX_RX_MISC_PDUS + 1)

/*
 * Inverse of the ISER_QP_MAX_REQ_DTOS formula: given a send WR budget,
 * return how many commands fit (integer division, rounds toward zero).
 * The argument is parenthesized so that expressions built from operators
 * with lower precedence than '-' (e.g. ?:, <<, |) expand correctly.
 */
#define ISER_GET_MAX_XMIT_CMDS(send_wr) (((send_wr)                     \
                                         - ISER_MAX_TX_MISC_PDUS        \
                                         - ISER_MAX_RX_MISC_PDUS - 1) / \
                                         (1 + ISER_INFLIGHT_DATAOUTS))

/* Number of entries in the wcs[] work-completion array of struct iser_comp. */
#define ISER_WC_BATCH_COUNT   16
/* NOTE(review): presumably the send-signalling interval - confirm at the post-send site. */
#define ISER_SIGNAL_CMD_COUNT 32

/*
 * Maximal QPs recommended per CQ.  If more QPs share a CQ we might encounter
 * a CQ overrun state.
 */
#define ISCSI_ISER_MAX_CONN     8
/* Per-CQ totals: the per-QP RX/TX limits multiplied by the QP count. */
#define ISER_MAX_RX_LEN         (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_LEN         (ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
/* RX total + TX total + one extra entry per connection. */
#define ISER_MAX_CQ_LEN         (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
                                 ISCSI_ISER_MAX_CONN)

/*
 * NOTE(review): presumably bit values for iser_cm_hdr.flags advertising
 * unsupported features during connection setup - confirm at the call site.
 */
#define ISER_ZBVA_NOT_SUPPORTED                0x80
#define ISER_SEND_W_INV_NOT_SUPPORTED   0x40

/* Data segment length used to size the login receive buffer (ISER_RX_LOGIN_SIZE). */
#define ISCSI_DEF_MAX_RECV_SEG_LEN      8192
/* Selects the low six bits of a byte. */
#define ISCSI_OPCODE_MASK               0x3f

/*
 * Downcast helpers: given a pointer to the generic ICL object embedded in an
 * iSER object (iser_conn.icl_conn, icl_iser_pdu.icl_pdu), recover a pointer
 * to the enclosing iSER object.
 */
#define icl_to_iser_conn(ic)    container_of(ic, struct iser_conn, icl_conn)
#define icl_to_iser_pdu(ip)     container_of(ip, struct icl_iser_pdu, icl_pdu)

/**
 * struct iser_hdr - iSER header
 *
 * Declared packed, so the members are laid out back to back with no
 * compiler-inserted padding.
 *
 * @flags:        flags support (zbva, remote_inv)
 * @rsvd:         reserved
 * @write_stag:   write rkey (big-endian)
 * @write_va:     write virtual address (big-endian)
 * @read_stag:    read rkey (big-endian)
 * @read_va:      read virtual address (big-endian)
 */
struct iser_hdr {
        u8      flags;
        u8      rsvd[3];
        __be32  write_stag;
        __be64  write_va;
        __be32  read_stag;
        __be64  read_va;
} __attribute__((packed));

/**
 * struct iser_cm_hdr - packed 4-byte header: one flags byte and three
 *                      reserved bytes
 *
 * NOTE(review): presumably exchanged as private data during RDMA CM
 * connection establishment - confirm at the connect call site.
 *
 * @flags:        flags byte
 * @rsvd:         reserved
 */
struct iser_cm_hdr {
        u8      flags;
        u8      rsvd[3];
} __packed;

/* Constant PDU lengths calculations */
/* iSER header followed by the iSCSI basic header segment. */
#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

/* Size of the data[] area in struct iser_rx_desc. */
#define ISER_RECV_DATA_SEG_LEN  128
/* Both headers plus the inline receive data area. */
#define ISER_RX_PAYLOAD_SIZE    (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

/* Both headers plus ISCSI_DEF_MAX_RECV_SEG_LEN bytes of data. */
#define ISER_RX_LOGIN_SIZE      (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)

/*
 * Connection life-cycle states.  ISER_CONN_STATES_NUM is not a state: it is
 * the count of the states listed before it.
 */
enum iser_conn_state {
        /* descriptor allocd, no conn */
        ISER_CONN_INIT = 0,
        /* in the process of being established */
        ISER_CONN_PENDING,
        /* up and running */
        ISER_CONN_UP,
        /* in the process of being terminated */
        ISER_CONN_TERMINATING,
        /* shut down */
        ISER_CONN_DOWN,
        ISER_CONN_STATES_NUM
};

/* Progress of a task; stored in icl_iser_pdu.status. */
enum iser_task_status {
        ISER_TASK_STATUS_INIT      = 0,
        ISER_TASK_STATUS_STARTED   = 1,
        ISER_TASK_STATUS_COMPLETED = 2
};

/*
 * Data transfer direction.  ISER_DIRS_NUM is the number of directions and is
 * used to size per-direction arrays.
 */
enum iser_data_dir {
        /* to initiator */
        ISER_DIR_IN = 0,
        /* from initiator */
        ISER_DIR_OUT,
        ISER_DIRS_NUM
};

/**
 * struct iser_mem_reg - iSER memory registration info
 *
 * @sge:          memory region sg element
 * @rkey:         memory region remote key
 * @mem_h:        pointer to registration context (FMR/Fastreg); untyped
 *                because the context type depends on the registration
 *                method in use
 */
struct iser_mem_reg {
        struct ib_sge    sge;
        u32              rkey;
        void            *mem_h;
};

/* Kind of TX descriptor (control / SCSI command / data-out); see iser_tx_desc.type. */
enum iser_desc_type {
        ISCSI_TX_CONTROL = 0,
        ISCSI_TX_SCSI_COMMAND,
        ISCSI_TX_DATAOUT
};

/**
 * struct iser_data_buf - iSER data buffer
 *
 * @sgl:          inline scatterlist storage, ISCSI_ISER_SG_TABLESIZE entries
 * @sg:           pointer to the sg list
 * @size:         number of entries in this sg list
 * @data_len:     total buffer length in bytes
 * @dma_nents:    value returned by dma_map_sg
 * @copy_buf:     copy buffer allocated for SGs that are unaligned for RDMA
 *                and therefore copied
 * @orig_sg:      pointer to the original sg list (in case a copy was used)
 * @sg_single:    SG-ified clone of a non-SG SC or of an unaligned SG
 */
struct iser_data_buf {
        struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
        void               *sg;
        int                size;
        unsigned long      data_len;
        unsigned int       dma_nents;
        char               *copy_buf;
        struct scatterlist *orig_sg;
        struct scatterlist sg_single;
};

/*
 * Forward declarations: these structures are fully defined further down in
 * this header, but pointers to them are needed before that point.
 */
struct iser_conn;
struct ib_conn;
struct iser_device;

/**
 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
 *
 * @iser_header:   iser header
 * @iscsi_header:  iscsi header (bhs); the member carries a packed attribute
 *                 (presumably so it follows @iser_header with no padding -
 *                 confirm)
 * @type:          command/control/dataout
 * @dma_addr:      header buffer dma_address
 * @tx_sg:         sg[0] points to iser/iscsi headers
 *                 sg[1] optionally points to either of immediate data
 *                 unsolicited data-out or control
 * @num_sge:       number sges used on this TX task
 * @mapped:        indicates if the descriptor is dma mapped
 */
struct iser_tx_desc {
        struct iser_hdr              iser_header;
        struct iscsi_bhs             iscsi_header __attribute__((packed));
        enum   iser_desc_type        type;
        u64                          dma_addr;
        struct ib_sge                tx_sg[2];
        int                          num_sge;
        bool                         mapped;
};

/*
 * Bytes of padding needed to bring the RX payload, the dma address and the
 * ib_sge up to 256 bytes in total.
 */
#define ISER_RX_PAD_SIZE        (256 - (ISER_RX_PAYLOAD_SIZE + \
                                        sizeof(u64) + sizeof(struct ib_sge)))
/**
 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
 *
 * The structure is packed.  NOTE(review): with @pad it should total 256
 * bytes, assuming sizeof(struct iscsi_bhs) equals ISCSI_BHS_SIZE - confirm.
 *
 * @iser_header:   iser header
 * @iscsi_header:  iscsi header
 * @data:          received data segment (ISER_RECV_DATA_SEG_LEN bytes)
 * @dma_addr:      receive buffer dma address
 * @rx_sg:         ib_sge of receive buffer
 * @pad:           for sense data TODO: Modify to maximum sense length supported
 */
struct iser_rx_desc {
        struct iser_hdr              iser_header;
        struct iscsi_bhs             iscsi_header;
        char                         data[ISER_RECV_DATA_SEG_LEN];
        u64                          dma_addr;
        struct ib_sge                rx_sg;
        char                         pad[ISER_RX_PAD_SIZE];
} __attribute__((packed));

/**
 * struct icl_iser_pdu - iSER wrapper around a generic ICL PDU
 *
 * @icl_pdu:       embedded generic ICL PDU; icl_to_iser_pdu() converts a
 *                 pointer to this member back to the enclosing structure
 * @desc:          TX descriptor embedded in the PDU
 * @iser_conn:     pointer to an iSER connection (presumably the one this
 *                 PDU belongs to - confirm)
 * @status:        task status (enum iser_task_status)
 * @csio:          pointer to a CAM SCSI I/O CCB (presumably the request
 *                 carried by this PDU - confirm)
 * @command_sent:  NOTE(review): looks like a "command was posted" flag -
 *                 confirm against the send path
 * @dir:           per-direction array indexed by enum iser_data_dir
 *                 (presumably non-zero when that direction is in use -
 *                 confirm)
 * @rdma_reg:      per-direction memory registration info
 * @data:          per-direction data buffers
 */
struct icl_iser_pdu {
        struct icl_pdu               icl_pdu;
        struct iser_tx_desc          desc;
        struct iser_conn             *iser_conn;
        enum iser_task_status        status;
        struct ccb_scsiio                        *csio;
        int                          command_sent;
        int                          dir[ISER_DIRS_NUM];
        struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
        struct iser_data_buf         data[ISER_DIRS_NUM];
};

/**
 * struct iser_comp - iSER completion context
 *
 * @device:     pointer to device handle
 * @cq:         completion queue
 * @wcs:        work completion array (ISER_WC_BATCH_COUNT entries)
 * @tq:         taskqueue handle
 * @task:       task to run task_fn
 * @active_qps: Number of active QPs attached
 *              to completion context
 */
struct iser_comp {
        struct iser_device      *device;
        struct ib_cq            *cq;
        struct ib_wc             wcs[ISER_WC_BATCH_COUNT];
        struct taskqueue        *tq;
        struct task             task;
        int                      active_qps;
};

/**
 * struct iser_device - iSER device handle
 *
 * @ib_device:     RDMA device
 * @pd:            Protection Domain for this device
 * @dev_attr:      Device attributes container
 * @mr:            Global DMA memory region
 * @event_handler: IB events handle routine
 * @ig_list:       entry in devices list
 * @refcount:      Reference counter, dominated by open iser connections
 * @comps_used:    Number of completion contexts used, Min between online
 *                 cpus and device max completion vectors
 * @comps:         Dynamically allocated array of completion handlers
 */
struct iser_device {
        struct ib_device             *ib_device;
        struct ib_pd                 *pd;
        struct ib_device_attr        dev_attr;
        struct ib_mr                 *mr;
        struct ib_event_handler      event_handler;
        struct list_head             ig_list;
        int                          refcount;
        int                          comps_used;
        struct iser_comp             *comps;
};

/**
 * struct iser_reg_resources - Fast registration resources
 *
 * @mr:         memory region
 * @mr_valid:   is mr valid indicator (single-bit field)
 */
struct iser_reg_resources {
        struct ib_mr                     *mr;
        u8                                mr_valid:1;
};

/**
 * struct fast_reg_descriptor - Fast registration descriptor
 *
 * @list:           entry in connection fastreg pool (the pool list head is
 *                  ib_conn.fastreg.pool)
 * @rsc:            data buffer registration resources
 */
struct fast_reg_descriptor {
        struct list_head                  list;
        struct iser_reg_resources         rsc;
};


/**
 * struct iser_beacon - beacon to signal all flush errors were drained
 *
 * @send and @recv live in an anonymous union, so they share storage and only
 * one of them is meaningful at a time.
 *
 * @send:           send wr
 * @recv:           recv wr
 * @flush_lock:     protects flush_cv
 * @flush_cv:       condition variable for beacon flush
 */
struct iser_beacon {
        union {
                struct ib_send_wr       send;
                struct ib_recv_wr       recv;
        };
        struct mtx                   flush_lock;
        struct cv                    flush_cv;
};

/**
 * struct ib_conn - Infiniband related objects
 *
 * @cma_id:              rdma_cm connection manager handle
 * @qp:                  Connection Queue-pair
 * @post_recv_buf_count: counter of receive buffers (presumably those
 *                       currently posted - confirm)
 * @sig_count:           8-bit counter (presumably used to decide when a
 *                       send is signalled - confirm)
 * @rx_wr:               array of ISER_MIN_POSTED_RX receive work requests
 * @device:              reference to iser device
 * @comp:                iser completion context
 * @beacon:              beacon used to detect that flush errors were drained
 * @lock:                mutex (NOTE(review): what it protects is not visible
 *                       in this header - confirm)
 * @fmr:                 FMR registration state: pool and page vector
 * @fastreg:             fast registration state: descriptor pool list and
 *                       its size
 *
 * @fmr and @fastreg share storage through an anonymous union, so only one
 * registration method's state is valid for a given connection.
 */
struct ib_conn {
        struct rdma_cm_id           *cma_id;
        struct ib_qp                *qp;
        int                          post_recv_buf_count;
        u8                           sig_count;
        struct ib_recv_wr            rx_wr[ISER_MIN_POSTED_RX];
        struct iser_device          *device;
        struct iser_comp            *comp;
        struct iser_beacon           beacon;
        struct mtx               lock;
        union {
                struct {
                        struct ib_fmr_pool      *pool;
                        struct iser_page_vec    *page_vec;
                } fmr;
                struct {
                        struct list_head         pool;
                        int                      pool_size;
                } fastreg;
        };
};

/**
 * struct iser_conn - iSER connection
 *
 * @icl_conn:         embedded generic ICL connection; icl_to_iser_conn()
 *                    converts a pointer to this member back to the
 *                    enclosing structure
 * @ib_conn:          embedded Infiniband connection objects
 * @up_cv:            condition variable (presumably signalled when the
 *                    connection comes up - confirm)
 * @conn_list:        list entry (presumably on the global connlist -
 *                    confirm)
 * @state_mutex:      sx lock (presumably protects @state - confirm)
 * @state:            connection state (enum iser_conn_state)
 * @qp_max_recv_dtos: NOTE(review): looks like the per-connection receive WR
 *                    limit - confirm
 * @min_posted_rx:    NOTE(review): looks like the per-connection counterpart
 *                    of ISER_MIN_POSTED_RX - confirm
 * @max_cmds:         16-bit command limit for the connection
 * @login_buf:        login buffer pointer
 * @login_req_buf:    login request buffer pointer
 * @login_resp_buf:   login response buffer pointer
 * @login_req_dma:    dma address for the login request buffer
 * @login_resp_dma:   dma address for the login response buffer
 * @rx_desc_head:     index into @rx_descs (presumably the next descriptor
 *                    to post - confirm)
 * @rx_descs:         pointer to the RX descriptor array
 * @num_rx_descs:     number of RX descriptors
 * @handoff_done:     boolean flag (presumably set once the connection
 *                    handoff has completed - confirm)
 */
struct iser_conn {
        struct icl_conn             icl_conn;
        struct ib_conn               ib_conn;
        struct cv                    up_cv;
        struct list_head             conn_list;
        struct sx                                state_mutex;
        enum iser_conn_state         state;
        int                                              qp_max_recv_dtos;
        int                                              min_posted_rx;
        u16                          max_cmds;
        char                         *login_buf;
        char                         *login_req_buf, *login_resp_buf;
        u64                          login_req_dma, login_resp_dma;
        unsigned int                 rx_desc_head;
        struct iser_rx_desc          *rx_descs;
        u32                          num_rx_descs;
        bool                         handoff_done;
};

/**
 * struct iser_global - iSER global context
 *
 * @device_list_mutex:    protects device_list
 * @device_list:          iser devices global list
 * @connlist_mutex:       protects connlist
 * @connlist:             iser connections global list
 * @close_conns_mutex:    serializes conns closure
 */
struct iser_global {
        struct sx        device_list_mutex;
        struct list_head  device_list;
        struct mtx        connlist_mutex;
        struct list_head  connlist;
        struct sx         close_conns_mutex;
};

/* The single global iSER context; defined outside this header. */
extern struct iser_global ig;
/* Verbosity level tested by ISER_DBG/ISER_INFO/ISER_WARN; defined outside this header. */
extern int iser_debug;

/*
 * Descriptor, login-buffer, posting and completion routines.  Only the
 * signatures are visible in this header; the implementations live elsewhere.
 * NOTE(review): the int-returning routines presumably return 0 on success
 * and an error code otherwise - confirm against the implementations.
 */
void
iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *);

int
iser_post_recvl(struct iser_conn *);

int
iser_post_recvm(struct iser_conn *, int);

int
iser_alloc_login_buf(struct iser_conn *iser_conn);

void
iser_free_login_buf(struct iser_conn *iser_conn);

int
iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool);

void
iser_snd_completion(struct iser_tx_desc *, struct ib_conn *);

void
iser_rcv_completion(struct iser_rx_desc *, unsigned long,
                    struct ib_conn *);

/*
 * PDU allocation/free, RX descriptor management and send entry points.
 * Only the signatures are visible in this header.
 * NOTE(review): the int-returning routines presumably return 0 on success
 * and an error code otherwise - confirm against the implementations.
 */
void
iser_pdu_free(struct icl_conn *, struct icl_pdu *);

struct icl_pdu *
iser_new_pdu(struct icl_conn *ic, int flags);

int
iser_alloc_rx_descriptors(struct iser_conn *, int);

void
iser_free_rx_descriptors(struct iser_conn *);

int
iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *);

int
iser_send_control(struct iser_conn *, struct icl_iser_pdu *);

int
iser_send_command(struct iser_conn *, struct icl_iser_pdu *);

/*
 * Memory registration, DMA mapping, connection teardown and RDMA CM event
 * handling.  Only the signatures are visible in this header.
 * NOTE(review): the int-returning routines presumably return 0 on success
 * and an error code otherwise - confirm against the implementations.
 */
int
iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);

void
iser_unreg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir);

int
iser_create_fastreg_pool(struct ib_conn *, unsigned);

void
iser_free_fastreg_pool(struct ib_conn *);

int
iser_dma_map_task_data(struct icl_iser_pdu *,
                       struct iser_data_buf *, enum iser_data_dir,
                       enum dma_data_direction);

int
iser_conn_terminate(struct iser_conn *);

void
iser_free_ib_conn_res(struct iser_conn *, bool);

void
iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *,
                         enum dma_data_direction);

/* Matches the rdma_cm event handler shape: (cm id, event) -> int. */
int
iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);

#endif /* !ICL_ISER_H */