/*-
 * Copyright (c) 2013-2021, Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_rss.h"
#include "opt_ratelimit.h"

#include <linux/module.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/uverbs_ioctl.h>
#include <dev/mlx5/mlx5_ib/mlx5_ib.h>

/* not supported currently */
static int wq_signature;

enum {
        MLX5_IB_ACK_REQ_FREQ    = 8,
};

enum {
        MLX5_IB_DEFAULT_SCHED_QUEUE     = 0x83,
        MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
        MLX5_IB_LINK_TYPE_IB            = 0,
        MLX5_IB_LINK_TYPE_ETH           = 1
};

enum {
        MLX5_IB_SQ_STRIDE       = 6,
};

static const u32 mlx5_ib_opcode[] = {
        [IB_WR_SEND]                            = MLX5_OPCODE_SEND,
        [IB_WR_LSO]                             = MLX5_OPCODE_LSO,
        [IB_WR_SEND_WITH_IMM]                   = MLX5_OPCODE_SEND_IMM,
        [IB_WR_RDMA_WRITE]                      = MLX5_OPCODE_RDMA_WRITE,
        [IB_WR_RDMA_WRITE_WITH_IMM]             = MLX5_OPCODE_RDMA_WRITE_IMM,
        [IB_WR_RDMA_READ]                       = MLX5_OPCODE_RDMA_READ,
        [IB_WR_ATOMIC_CMP_AND_SWP]              = MLX5_OPCODE_ATOMIC_CS,
        [IB_WR_ATOMIC_FETCH_AND_ADD]            = MLX5_OPCODE_ATOMIC_FA,
        [IB_WR_SEND_WITH_INV]                   = MLX5_OPCODE_SEND_INVAL,
        [IB_WR_LOCAL_INV]                       = MLX5_OPCODE_UMR,
        [IB_WR_REG_MR]                          = MLX5_OPCODE_UMR,
        [IB_WR_MASKED_ATOMIC_CMP_AND_SWP]       = MLX5_OPCODE_ATOMIC_MASKED_CS,
        [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]     = MLX5_OPCODE_ATOMIC_MASKED_FA,
        [MLX5_IB_WR_UMR]                        = MLX5_OPCODE_UMR,
};

struct mlx5_wqe_eth_pad {
        u8 rsvd0[16];
};

enum raw_qp_set_mask_map {
        MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID         = 1UL << 0,
};

struct mlx5_modify_raw_qp_param {
        u16 operation;

        u32 set_mask; /* raw_qp_set_mask_map */
        u8 rq_q_ctr_id;
};

static void get_cqs(enum ib_qp_type qp_type,
                    struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
                    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq);

static int is_qp0(enum ib_qp_type qp_type)
{
        return qp_type == IB_QPT_SMI;
}

static int is_sqp(enum ib_qp_type qp_type)
{
        return is_qp0(qp_type) || is_qp1(qp_type);
}

static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
{
        return mlx5_buf_offset(&qp->buf, offset);
}

static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
{
        return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
}

void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
{
        return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
}

/**
 * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
 *
 * @qp: QP to copy from.
 * @send: copy from the send queue when non-zero, use the receive queue
 *        otherwise.
 * @wqe_index:  index to start copying from. For send work queues, the
 *              wqe_index is in units of MLX5_SEND_WQE_BB.
 *              For receive work queues, it is the index of the work
 *              queue element in the queue.
 * @buffer: destination buffer.
 * @length: maximum number of bytes to copy.
 * @base: QP base holding the user buffer (ubuffer.umem) to copy from.
 *
 * Copies at least a single WQE, but may copy more data.
 *
 * Return: the number of bytes copied, or an error code.
 */
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
                          void *buffer, u32 length,
                          struct mlx5_ib_qp_base *base)
{
        struct ib_device *ibdev = qp->ibqp.device;
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
        size_t offset;
        size_t wq_end;
        struct ib_umem *umem = base->ubuffer.umem;
        u32 first_copy_length;
        int wqe_length;
        int ret;

        if (wq->wqe_cnt == 0) {
                mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
                            qp->ibqp.qp_type);
                return -EINVAL;
        }

        offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
        wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);

        if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
                return -EINVAL;

        if (offset > umem->length ||
            (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
                return -EINVAL;

        first_copy_length = min_t(u32, offset + length, wq_end) - offset;
        ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
        if (ret)
                return ret;

        if (send) {
                struct mlx5_wqe_ctrl_seg *ctrl = buffer;
                int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;

                wqe_length = ds * MLX5_WQE_DS_UNITS;
        } else {
                wqe_length = 1 << wq->wqe_shift;
        }

        if (wqe_length <= first_copy_length)
                return first_copy_length;

        ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
                                wqe_length - first_copy_length);
        if (ret)
                return ret;

        return wqe_length;
}

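/*
 * Forward a firmware asynchronous QP event to the ULP's event handler,
 * translating the mlx5 event type to the corresponding ib_event.
 */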
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
{
        struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
        struct ib_event event;

        if (type == MLX5_EVENT_TYPE_PATH_MIG) {
                /* This event is only valid for trans_qps */
                to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port;
        }

        if (ibqp->event_handler) {
                event.device     = ibqp->device;
                event.element.qp = ibqp;
                switch (type) {
                case MLX5_EVENT_TYPE_PATH_MIG:
                        event.event = IB_EVENT_PATH_MIG;
                        break;
                case MLX5_EVENT_TYPE_COMM_EST:
                        event.event = IB_EVENT_COMM_EST;
                        break;
                case MLX5_EVENT_TYPE_SQ_DRAINED:
                        event.event = IB_EVENT_SQ_DRAINED;
                        break;
                case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
                        event.event = IB_EVENT_QP_LAST_WQE_REACHED;
                        break;
                case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
                        event.event = IB_EVENT_QP_FATAL;
                        break;
                case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
                        event.event = IB_EVENT_PATH_MIG_ERR;
                        break;
                case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
                        event.event = IB_EVENT_QP_REQ_ERR;
                        break;
                case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
                        event.event = IB_EVENT_QP_ACCESS_ERR;
                        break;
                default:
                        pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
                        return;
                }

                ibqp->event_handler(&event, ibqp->qp_context);
        }
}

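/*
 * Work out the receive queue geometry (wqe_cnt, wqe_shift, max_gs and
 * max_post). For user QPs the layout comes from the create command;
 * for kernel QPs it is derived from the requested capabilities, with
 * the WQE size and count rounded up to powers of two.
 */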
static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
                       int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
{
        int wqe_size;
        int wq_size;

        /* Sanity check RQ size before proceeding */
        if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)))
                return -EINVAL;

        if (!has_rq) {
                qp->rq.max_gs = 0;
                qp->rq.wqe_cnt = 0;
                qp->rq.wqe_shift = 0;
                cap->max_recv_wr = 0;
                cap->max_recv_sge = 0;
        } else {
                if (ucmd) {
                        qp->rq.wqe_cnt = ucmd->rq_wqe_count;
                        qp->rq.wqe_shift = ucmd->rq_wqe_shift;
                        qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
                        qp->rq.max_post = qp->rq.wqe_cnt;
                } else {
                        wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
                        wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
                        wqe_size = roundup_pow_of_two(wqe_size);
                        wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
                        wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
                        qp->rq.wqe_cnt = wq_size / wqe_size;
                        if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) {
                                mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
                                            wqe_size,
                                            MLX5_CAP_GEN(dev->mdev,
                                                         max_wqe_sz_rq));
                                return -EINVAL;
                        }
                        qp->rq.wqe_shift = ilog2(wqe_size);
                        qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
                        qp->rq.max_post = qp->rq.wqe_cnt;
                }
        }

        return 0;
}

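/*
 * Fixed per-WQE overhead, in bytes, for the given QP type: the control
 * segment plus the largest combination of transport-specific segments
 * that may precede the data segments.
 */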
static int sq_overhead(struct ib_qp_init_attr *attr)
{
        int size = 0;

        switch (attr->qp_type) {
        case IB_QPT_XRC_INI:
                size += sizeof(struct mlx5_wqe_xrc_seg);
                /* fall through */
        case IB_QPT_RC:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
                        max(sizeof(struct mlx5_wqe_atomic_seg) +
                            sizeof(struct mlx5_wqe_raddr_seg),
                            sizeof(struct mlx5_wqe_umr_ctrl_seg) +
                            sizeof(struct mlx5_mkey_seg));
                break;

        case IB_QPT_XRC_TGT:
                return 0;

        case IB_QPT_UC:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
                        max(sizeof(struct mlx5_wqe_raddr_seg),
                            sizeof(struct mlx5_wqe_umr_ctrl_seg) +
                            sizeof(struct mlx5_mkey_seg));
                break;

        case IB_QPT_UD:
                if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
                        size += sizeof(struct mlx5_wqe_eth_pad) +
                                sizeof(struct mlx5_wqe_eth_seg);
                /* fall through */
        case IB_QPT_SMI:
        case MLX5_IB_QPT_HW_GSI:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
                        sizeof(struct mlx5_wqe_datagram_seg);
                break;

        case MLX5_IB_QPT_REG_UMR:
                size += sizeof(struct mlx5_wqe_ctrl_seg) +
                        sizeof(struct mlx5_wqe_umr_ctrl_seg) +
                        sizeof(struct mlx5_mkey_seg);
                break;

        default:
                return -EINVAL;
        }

        return size;
}

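/*
 * Size of a send WQE in bytes: the per-type overhead plus the larger of
 * the data segments and the inline payload, aligned up to the WQE basic
 * block (MLX5_SEND_WQE_BB); signature-enabled QPs get at least
 * MLX5_SIG_WQE_SIZE.
 */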
static int calc_send_wqe(struct ib_qp_init_attr *attr)
{
        int inl_size = 0;
        int size;

        size = sq_overhead(attr);
        if (size < 0)
                return size;

        if (attr->cap.max_inline_data) {
                inl_size = size + sizeof(struct mlx5_wqe_inline_seg) +
                        attr->cap.max_inline_data;
        }

        size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
        if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
            ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
                return MLX5_SIG_WQE_SIZE;
        else
                return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
}

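/*
 * Number of scatter/gather entries that fit in a send WQE of wqe_size
 * bytes after the per-type segment overhead; for RC and XRC_INI the
 * usable space is additionally capped at 512 bytes.
 */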
static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size)
{
        int max_sge;

        if (attr->qp_type == IB_QPT_RC)
                max_sge = (min_t(int, wqe_size, 512) -
                           sizeof(struct mlx5_wqe_ctrl_seg) -
                           sizeof(struct mlx5_wqe_raddr_seg)) /
                        sizeof(struct mlx5_wqe_data_seg);
        else if (attr->qp_type == IB_QPT_XRC_INI)
                max_sge = (min_t(int, wqe_size, 512) -
                           sizeof(struct mlx5_wqe_ctrl_seg) -
                           sizeof(struct mlx5_wqe_xrc_seg) -
                           sizeof(struct mlx5_wqe_raddr_seg)) /
                        sizeof(struct mlx5_wqe_data_seg);
        else
                max_sge = (wqe_size - sq_overhead(attr)) /
                        sizeof(struct mlx5_wqe_data_seg);

        /* cap by the space that remains after the per-type overhead */
        return min_t(int, max_sge, (wqe_size - sq_overhead(attr)) /
                     sizeof(struct mlx5_wqe_data_seg));
}

static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
                        struct mlx5_ib_qp *qp)
{
        int wqe_size;
        int wq_size;

        if (!attr->cap.max_send_wr)
                return 0;

        wqe_size = calc_send_wqe(attr);
        mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size);
        if (wqe_size < 0)
                return wqe_size;

        if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
                mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n",
                            wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
                return -EINVAL;
        }

        qp->max_inline_data = wqe_size - sq_overhead(attr) -
                              sizeof(struct mlx5_wqe_inline_seg);
        attr->cap.max_inline_data = qp->max_inline_data;

        if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
                qp->signature_en = true;

        wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
        qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
        if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
                mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n",
                            qp->sq.wqe_cnt,
                            1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
                return -ENOMEM;
        }
        qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
        qp->sq.max_gs = get_send_sge(attr, wqe_size);
        if (qp->sq.max_gs < attr->cap.max_send_sge)
                return -ENOMEM;

        attr->cap.max_send_sge = qp->sq.max_gs;
        qp->sq.max_post = wq_size / wqe_size;
        attr->cap.max_send_wr = qp->sq.max_post;

        return wq_size;
}

static int set_user_buf_size(struct mlx5_ib_dev *dev,
                            struct mlx5_ib_qp *qp,
                            struct mlx5_ib_create_qp *ucmd,
                            struct mlx5_ib_qp_base *base,
                            struct ib_qp_init_attr *attr)
{
        int desc_sz = 1 << qp->sq.wqe_shift;

        if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
                mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
                             desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
                return -EINVAL;
        }

        if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
                mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
                             ucmd->sq_wqe_count);
                return -EINVAL;
        }

        qp->sq.wqe_cnt = ucmd->sq_wqe_count;

        if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
                mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
                             qp->sq.wqe_cnt,
                             1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
                return -EINVAL;
        }

        if (attr->qp_type == IB_QPT_RAW_PACKET) {
                base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
                qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
        } else {
                base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                                         (qp->sq.wqe_cnt << 6);
        }

        return 0;
}

static int qp_has_rq(struct ib_qp_init_attr *attr)
{
        if (attr->qp_type == IB_QPT_XRC_INI ||
            attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
            attr->qp_type == MLX5_IB_QPT_REG_UMR ||
            !attr->cap.max_recv_wr)
                return 0;

        return 1;
}

enum {
        /* this is the first blue flame register in the array of bfregs assigned
         * to a process. Since we do not use it for blue flame but rather
         * regular 64 bit doorbells, we do not need a lock for maintaining
         * "odd/even" order
         */
        NUM_NON_BLUE_FLAME_BFREGS = 1,
};

static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi)
{
        return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR;
}

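/*
 * Medium-class (shareable) bfregs are whatever remains after the
 * low-latency high-class bfregs and the one register reserved for
 * regular doorbells.
 */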
static int num_med_bfreg(struct mlx5_ib_dev *dev,
                         struct mlx5_bfreg_info *bfregi)
{
        int n;

        n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs -
            NUM_NON_BLUE_FLAME_BFREGS;

        return n >= 0 ? n : 0;
}

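/*
 * bfreg 0 is reserved for regular doorbells; the medium class starts at
 * index 1 and the high class follows it.
 */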
static int first_med_bfreg(struct mlx5_ib_dev *dev,
                           struct mlx5_bfreg_info *bfregi)
{
        return num_med_bfreg(dev, bfregi) ? 1 : -ENOMEM;
}

static int first_hi_bfreg(struct mlx5_ib_dev *dev,
                          struct mlx5_bfreg_info *bfregi)
{
        int med;

        med = num_med_bfreg(dev, bfregi);
        return ++med;
}

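/* High-class bfregs are exclusively owned: take the first unused one. */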
static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev,
                                  struct mlx5_bfreg_info *bfregi)
{
        int i;

        for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) {
                if (!bfregi->count[i]) {
                        bfregi->count[i]++;
                        return i;
                }
        }

        return -ENOMEM;
}

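/* Medium-class bfregs may be shared: pick the least-referenced one. */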
static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev,
                                 struct mlx5_bfreg_info *bfregi)
{
        int minidx = first_med_bfreg(dev, bfregi);
        int i;

        if (minidx < 0)
                return minidx;

        for (i = minidx; i < first_hi_bfreg(dev, bfregi); i++) {
                if (bfregi->count[i] < bfregi->count[minidx])
                        minidx = i;
                if (!bfregi->count[minidx])
                        break;
        }

        bfregi->count[minidx]++;
        return minidx;
}

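/*
 * Prefer an exclusive high-class bfreg, then a shared medium-class one,
 * and finally fall back to bfreg 0, which can always be shared since it
 * is only used for regular 64-bit doorbells.
 */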
static int alloc_bfreg(struct mlx5_ib_dev *dev,
                       struct mlx5_bfreg_info *bfregi)
{
        int bfregn = -ENOMEM;

        if (bfregi->lib_uar_dyn)
                return -EINVAL;

        mutex_lock(&bfregi->lock);
        if (bfregi->ver >= 2) {
                bfregn = alloc_high_class_bfreg(dev, bfregi);
                if (bfregn < 0)
                        bfregn = alloc_med_class_bfreg(dev, bfregi);
        }

        if (bfregn < 0) {
                BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1);
                bfregn = 0;
                bfregi->count[bfregn]++;
        }
        mutex_unlock(&bfregi->lock);

        return bfregn;
}

void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn)
{
        mutex_lock(&bfregi->lock);
        bfregi->count[bfregn]--;
        mutex_unlock(&bfregi->lock);
}

static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
{
        switch (state) {
        case IB_QPS_RESET:      return MLX5_QP_STATE_RST;
        case IB_QPS_INIT:       return MLX5_QP_STATE_INIT;
        case IB_QPS_RTR:        return MLX5_QP_STATE_RTR;
        case IB_QPS_RTS:        return MLX5_QP_STATE_RTS;
        case IB_QPS_SQD:        return MLX5_QP_STATE_SQD;
        case IB_QPS_SQE:        return MLX5_QP_STATE_SQER;
        case IB_QPS_ERR:        return MLX5_QP_STATE_ERR;
        default:                return -1;
        }
}

static int to_mlx5_st(enum ib_qp_type type)
{
        switch (type) {
        case IB_QPT_RC:                 return MLX5_QP_ST_RC;
        case IB_QPT_UC:                 return MLX5_QP_ST_UC;
        case IB_QPT_UD:                 return MLX5_QP_ST_UD;
        case MLX5_IB_QPT_REG_UMR:       return MLX5_QP_ST_REG_UMR;
        case IB_QPT_XRC_INI:
        case IB_QPT_XRC_TGT:            return MLX5_QP_ST_XRC;
        case IB_QPT_SMI:                return MLX5_QP_ST_QP0;
        case MLX5_IB_QPT_HW_GSI:        return MLX5_QP_ST_QP1;
        case IB_QPT_RAW_IPV6:           return MLX5_QP_ST_RAW_IPV6;
        case IB_QPT_RAW_PACKET:
        case IB_QPT_RAW_ETHERTYPE:      return MLX5_QP_ST_RAW_ETHERTYPE;
        case IB_QPT_MAX:
        default:                return -EINVAL;
        }
}

static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
                             struct mlx5_ib_cq *recv_cq);
static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
                               struct mlx5_ib_cq *recv_cq);

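/*
 * Translate a bfreg number to the index of the UAR page backing it.
 * Dynamically allocated bfregs live on system pages that follow the
 * statically allocated ones.
 */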
int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
                        struct mlx5_bfreg_info *bfregi, u32 bfregn,
                        bool dyn_bfreg)
{
        unsigned int bfregs_per_sys_page;
        u32 index_of_sys_page;
        u32 offset;

        if (bfregi->lib_uar_dyn)
                return -EINVAL;

        bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) *
                                MLX5_NON_FP_BFREGS_PER_UAR;
        index_of_sys_page = bfregn / bfregs_per_sys_page;

        if (dyn_bfreg) {
                index_of_sys_page += bfregi->num_static_sys_pages;

                if (index_of_sys_page >= bfregi->num_sys_pages)
                        return -EINVAL;

                if (bfregn > bfregi->num_dyn_bfregs ||
                    bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) {
                        mlx5_ib_dbg(dev, "Invalid dynamic uar index\n");
                        return -EINVAL;
                }
        }

        offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR;
        return bfregi->sys_pages[index_of_sys_page] + offset;
}

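/*
 * Pin a user buffer and describe its page layout: total number of
 * pages, the best page shift for addressing it, the number of compound
 * pages at that shift, and the buffer's offset within the first page.
 */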
static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
                            struct ib_pd *pd,
                            unsigned long addr, size_t size,
                            struct ib_umem **umem,
                            int *npages, int *page_shift, int *ncont,
                            u32 *offset)
{
        int err;

        *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
        if (IS_ERR(*umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                return PTR_ERR(*umem);
        }

        mlx5_ib_cont_pages(*umem, addr, 0, npages, page_shift, ncont, NULL);

        err = mlx5_ib_get_buf_offset(addr, *page_shift, offset);
        if (err) {
                mlx5_ib_warn(dev, "bad offset\n");
                goto err_umem;
        }

        mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n",
                    addr, size, *npages, *page_shift, *ncont, *offset);

        return 0;

err_umem:
        ib_umem_release(*umem);
        *umem = NULL;

        return err;
}

static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq,
                            struct ib_udata *udata)
{
        struct mlx5_ib_ucontext *context =
                rdma_udata_to_drv_context(
                        udata,
                        struct mlx5_ib_ucontext,
                        ibucontext);

        mlx5_ib_db_unmap_user(context, &rwq->db);
        if (rwq->umem)
                ib_umem_release(rwq->umem);
}

static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                          struct mlx5_ib_rwq *rwq,
                          struct mlx5_ib_create_wq *ucmd)
{
        struct mlx5_ib_ucontext *context;
        int page_shift = 0;
        int npages;
        u32 offset = 0;
        int ncont = 0;
        int err;

        if (!ucmd->buf_addr)
                return -EINVAL;

        context = to_mucontext(pd->uobject->context);
        rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
                               rwq->buf_size, 0, 0);
        if (IS_ERR(rwq->umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                err = PTR_ERR(rwq->umem);
                return err;
        }

        mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, 0, &npages, &page_shift,
                           &ncont, NULL);
        err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift,
                                     &rwq->rq_page_offset);
        if (err) {
                mlx5_ib_warn(dev, "bad offset\n");
                goto err_umem;
        }

        rwq->rq_num_pas = ncont;
        rwq->page_shift = page_shift;
        rwq->log_page_size =  page_shift - MLX5_ADAPTER_PAGE_SHIFT;
        rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE);

        mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n",
                    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
                    npages, page_shift, ncont, offset);

        err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_umem;
        }

        rwq->create_type = MLX5_WQ_USER;
        return 0;

err_umem:
        ib_umem_release(rwq->umem);
        return err;
}

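/*
 * Convert a driver bfreg number, which counts only non-fast-path
 * bfregs, to the user-visible numbering in which every UAR contributes
 * MLX5_BFREGS_PER_UAR slots.
 */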
static int adjust_bfregn(struct mlx5_ib_dev *dev,
                         struct mlx5_bfreg_info *bfregi, int bfregn)
{
        return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR +
                                bfregn % MLX5_NON_FP_BFREGS_PER_UAR;
}

static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                          struct mlx5_ib_qp *qp, struct ib_udata *udata,
                          struct ib_qp_init_attr *attr,
                          u32 **in,
                          struct mlx5_ib_create_qp_resp *resp, int *inlen,
                          struct mlx5_ib_qp_base *base)
{
        struct mlx5_ib_ucontext *context;
        struct mlx5_ib_create_qp ucmd;
        struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
        int page_shift = 0;
        int uar_index = 0;
        int npages;
        u32 offset = 0;
        int bfregn;
        int ncont = 0;
        __be64 *pas;
        void *qpc;
        int err;
        u16 uid;
        u32 uar_flags;

        err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
        if (err) {
                mlx5_ib_dbg(dev, "copy failed\n");
                return err;
        }

        context = to_mucontext(pd->uobject->context);
        uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX |
                                  MLX5_QP_FLAG_BFREG_INDEX);
        switch (uar_flags) {
        case MLX5_QP_FLAG_UAR_PAGE_INDEX:
                uar_index = ucmd.bfreg_index;
                bfregn = MLX5_IB_INVALID_BFREG;
                break;
        case MLX5_QP_FLAG_BFREG_INDEX:
                uar_index = bfregn_to_uar_index(dev, &context->bfregi,
                                                ucmd.bfreg_index, true);
                if (uar_index < 0)
                        return uar_index;
                bfregn = MLX5_IB_INVALID_BFREG;
                break;
        case 0:
                if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
                        return -EINVAL;
                bfregn = alloc_bfreg(dev, &context->bfregi);
                if (bfregn < 0)
                        return bfregn;
                break;
        default:
                return -EINVAL;
        }

        mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
        if (bfregn != MLX5_IB_INVALID_BFREG)
                uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn,
                                                false);

        qp->rq.offset = 0;
        qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
        qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;

        err = set_user_buf_size(dev, qp, &ucmd, base, attr);
        if (err)
                goto err_bfreg;

        if (ucmd.buf_addr && ubuffer->buf_size) {
                ubuffer->buf_addr = ucmd.buf_addr;
                err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
                                       ubuffer->buf_size,
                                       &ubuffer->umem, &npages, &page_shift,
                                       &ncont, &offset);
                if (err)
                        goto err_bfreg;
        } else {
                ubuffer->umem = NULL;
        }

        *inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
                 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont;
        *in = mlx5_vzalloc(*inlen);
        if (!*in) {
                err = -ENOMEM;
                goto err_umem;
        }

        uid = (attr->qp_type != IB_QPT_XRC_TGT &&
               attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
        MLX5_SET(create_qp_in, *in, uid, uid);
        pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
        if (ubuffer->umem)
                mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0);

        qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);

        MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT);
        MLX5_SET(qpc, qpc, page_offset, offset);

        MLX5_SET(qpc, qpc, uar_page, uar_index);
        if (bfregn != MLX5_IB_INVALID_BFREG)
                resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn);
        else
                resp->bfreg_index = MLX5_IB_INVALID_BFREG;
        qp->bfregn = bfregn;

        err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_free;
        }

        err = ib_copy_to_udata(udata, resp, sizeof(*resp));
        if (err) {
                mlx5_ib_dbg(dev, "copy failed\n");
                goto err_unmap;
        }
        qp->create_type = MLX5_QP_USER;

        return 0;

err_unmap:
        mlx5_ib_db_unmap_user(context, &qp->db);

err_free:
        kvfree(*in);

err_umem:
        if (ubuffer->umem)
                ib_umem_release(ubuffer->umem);

err_bfreg:
        if (bfregn != MLX5_IB_INVALID_BFREG)
                mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn);
        return err;
}

static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp,
                            struct mlx5_ib_qp_base *base,
                            struct ib_udata *udata)
{
        struct mlx5_ib_ucontext *context =
                rdma_udata_to_drv_context(
                        udata,
                        struct mlx5_ib_ucontext,
                        ibucontext);

        mlx5_ib_db_unmap_user(context, &qp->db);
        if (base->ubuffer.umem)
                ib_umem_release(base->ubuffer.umem);

        /*
         * Free only the BFREGs which are handled by the kernel.
         * BFREGs of UARs allocated dynamically are handled by user.
         */
        if (qp->bfregn != MLX5_IB_INVALID_BFREG)
                mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
}

static int create_kernel_qp(struct mlx5_ib_dev *dev,
                            struct ib_qp_init_attr *init_attr,
                            struct mlx5_ib_qp *qp,
                            u32 **in, int *inlen,
                            struct mlx5_ib_qp_base *base)
{
        int uar_index;
        void *qpc;
        int err;

        if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN |
                                        IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
                                        IB_QP_CREATE_IPOIB_UD_LSO |
                                        MLX5_IB_QP_CREATE_SQPN_QP1 |
                                        MLX5_IB_QP_CREATE_WC_TEST))
                return -EINVAL;

        spin_lock_init(&qp->bf.lock32);

        if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
                qp->bf.bfreg = &dev->fp_bfreg;
        else if (init_attr->create_flags & MLX5_IB_QP_CREATE_WC_TEST)
                qp->bf.bfreg = &dev->wc_bfreg;
        else
                qp->bf.bfreg = &dev->bfreg;

        /* We need to divide by two since each register consists of
         * two buffers of identical size, namely odd and even
         */
        qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2;
        uar_index = qp->bf.bfreg->index;

        err = calc_sq_size(dev, init_attr, qp);
        if (err < 0) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
        }

        qp->rq.offset = 0;
        qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
        base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);

        err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size,
            2 * PAGE_SIZE, &qp->buf);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
        }

        qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
        *inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
                 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages;
        *in = mlx5_vzalloc(*inlen);
        if (!*in) {
                err = -ENOMEM;
                goto err_buf;
        }

        qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc);
        MLX5_SET(qpc, qpc, uar_page, uar_index);
        MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev));
        MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);

        /* Set "fast registration enabled" for all kernel QPs */
        MLX5_SET(qpc, qpc, fre, 1);
        MLX5_SET(qpc, qpc, rlky, 1);

        if (init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) {
                MLX5_SET(qpc, qpc, deth_sqpn, 1);
                qp->flags |= MLX5_IB_QP_SQPN_QP1;
        }

        mlx5_fill_page_array(&qp->buf,
                             (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas));

        err = mlx5_db_alloc(dev->mdev, &qp->db);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                goto err_free;
        }

        qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
        qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
        qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
        qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
        qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);

        if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
            !qp->sq.w_list || !qp->sq.wqe_head) {
                err = -ENOMEM;
                goto err_wrid;
        }
        qp->create_type = MLX5_QP_KERNEL;

        return 0;

err_wrid:
        kfree(qp->sq.wqe_head);
        kfree(qp->sq.w_list);
        kfree(qp->sq.wrid);
        kfree(qp->sq.wr_data);
        kfree(qp->rq.wrid);
        mlx5_db_free(dev->mdev, &qp->db);

err_free:
        kvfree(*in);

err_buf:
        mlx5_buf_free(dev->mdev, &qp->buf);
        return err;
}

static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
{
        kfree(qp->sq.wqe_head);
        kfree(qp->sq.w_list);
        kfree(qp->sq.wrid);
        kfree(qp->sq.wr_data);
        kfree(qp->rq.wrid);
        mlx5_db_free(dev->mdev, &qp->db);
        mlx5_buf_free(dev->mdev, &qp->buf);
}

static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
{
        if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
            (attr->qp_type == IB_QPT_XRC_INI))
                return MLX5_SRQ_RQ;
        else if (!qp->has_rq)
                return MLX5_ZERO_LEN_RQ;
        else
                return MLX5_NON_ZERO_RQ;
}

static int is_connected(enum ib_qp_type qp_type)
{
        if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
                return 1;

        return 0;
}

static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_sq *sq, u32 tdn,
                                    struct ib_pd *pd)
{
        u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
        void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

        MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid);
        MLX5_SET(tisc, tisc, transport_domain, tdn);
        return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
}

static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
                                      struct mlx5_ib_sq *sq, struct ib_pd *pd)
{
        mlx5_core_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid);
}

static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
                                   struct mlx5_ib_sq *sq, void *qpin,
                                   struct ib_pd *pd)
{
        struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer;
        __be64 *pas;
        void *in;
        void *sqc;
        void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
        void *wq;
        int inlen;
        int err;
        int page_shift = 0;
        int npages;
        int ncont = 0;
        u32 offset = 0;
        u8 ts_format;

        ts_format = mlx5_get_sq_default_ts(dev->mdev);

        err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
                               &sq->ubuffer.umem, &npages, &page_shift,
                               &ncont, &offset);
        if (err)
                return err;

        inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont;
        in = mlx5_vzalloc(inlen);
        if (!in) {
                err = -ENOMEM;
                goto err_umem;
        }

        MLX5_SET(create_sq_in, in, uid, to_mpd(pd)->uid);
        sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
        MLX5_SET(sqc, sqc, flush_in_error_en, 1);
        MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
        MLX5_SET(sqc, sqc, ts_format, ts_format);
        MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index));
        MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd));
        MLX5_SET(sqc, sqc, tis_lst_sz, 1);
        MLX5_SET(sqc, sqc, tis_num_0, sq->tisn);

        wq = MLX5_ADDR_OF(sqc, sqc, wq);
        MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
        MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
        MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page));
        MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
        MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
        MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size));
        MLX5_SET(wq, wq, log_wq_pg_sz,  page_shift - MLX5_ADAPTER_PAGE_SHIFT);
        MLX5_SET(wq, wq, page_offset, offset);

        pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
        mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);

        err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp);

        kvfree(in);

        if (err)
                goto err_umem;

        return 0;

err_umem:
        ib_umem_release(sq->ubuffer.umem);
        sq->ubuffer.umem = NULL;

        return err;
}

static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
                                     struct mlx5_ib_sq *sq)
{
        mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
        ib_umem_release(sq->ubuffer.umem);
}

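/*
 * Bytes of physical address entries needed to map the receive queue:
 * the RQ byte size plus the page offset (given in 1/64ths of a page),
 * rounded up to whole pages, with one 64-bit address per page.
 */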
static int get_rq_pas_size(void *qpc)
{
        u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12;
        u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride);
        u32 log_rq_size   = MLX5_GET(qpc, qpc, log_rq_size);
        u32 page_offset   = MLX5_GET(qpc, qpc, page_offset);
        u32 po_quanta     = 1 << (log_page_size - 6);
        u32 rq_sz         = 1 << (log_rq_size + 4 + log_rq_stride);
        u32 page_size     = 1 << log_page_size;
        u32 rq_sz_po      = rq_sz + (page_offset * po_quanta);
        u32 rq_num_pas    = (rq_sz_po + page_size - 1) / page_size;

        return rq_num_pas * sizeof(u64);
}

static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
                                   struct mlx5_ib_rq *rq, void *qpin,
                                   struct ib_pd *pd)
{
        struct mlx5_ib_qp *mqp = rq->base.container_mibqp;
        __be64 *pas;
        __be64 *qp_pas;
        void *in;
        void *rqc;
        void *wq;
        void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc);
        int inlen;
        int err;
        u32 rq_pas_size = get_rq_pas_size(qpc);
        u8 ts_format;

        ts_format = mlx5_get_rq_default_ts(dev->mdev);

        inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size;
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid);
        rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
        MLX5_SET(rqc, rqc, vlan_strip_disable, 1);
        MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE);
        MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
        MLX5_SET(rqc, rqc, ts_format, ts_format);
        MLX5_SET(rqc, rqc, flush_in_error_en, 1);
        MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
        MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));

        if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS)
                MLX5_SET(rqc, rqc, scatter_fcs, 1);

        wq = MLX5_ADDR_OF(rqc, rqc, wq);
        MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
        MLX5_SET(wq, wq, end_padding_mode,
                 MLX5_GET(qpc, qpc, end_padding_mode));
        MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset));
        MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd));
        MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr));
        MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4);
        MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size));
        MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size));

        pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
        qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas);
        memcpy(pas, qp_pas, rq_pas_size);

        err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp);

        kvfree(in);

        return err;
}

static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
                                     struct mlx5_ib_rq *rq)
{
        mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp);
}

static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_rq *rq, u32 tdn,
                                    struct ib_pd *pd)
{
        u32 *in;
        void *tirc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(create_tir_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid);
        tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
        MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
        MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
        MLX5_SET(tirc, tirc, transport_domain, tdn);

        err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);

        kvfree(in);

        return err;
}

static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
                                      struct mlx5_ib_rq *rq,
                                      struct ib_pd *pd)
{
        mlx5_core_destroy_tir(dev->mdev, rq->tirn, to_mpd(pd)->uid);
}

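/*
 * A raw packet QP is built from separate objects: the SQ is attached to
 * a TIS and the RQ is exposed through a TIR, both in the context's
 * transport domain. The QPN reported for the QP is the SQ number when a
 * send queue exists, otherwise the RQ number.
 */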
static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                u32 *in,
                                struct ib_pd *pd)
{
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
        struct ib_uobject *uobj = pd->uobject;
        struct ib_ucontext *ucontext = uobj->context;
        struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
        int err;
        u32 tdn = mucontext->tdn;

        if (qp->sq.wqe_cnt) {
                err = create_raw_packet_qp_tis(dev, sq, tdn, pd);
                if (err)
                        return err;

                err = create_raw_packet_qp_sq(dev, sq, in, pd);
                if (err)
                        goto err_destroy_tis;

                sq->base.container_mibqp = qp;
        }

        if (qp->rq.wqe_cnt) {
                rq->base.container_mibqp = qp;

                err = create_raw_packet_qp_rq(dev, rq, in, pd);
                if (err)
                        goto err_destroy_sq;

                err = create_raw_packet_qp_tir(dev, rq, tdn, pd);
                if (err)
                        goto err_destroy_rq;
        }

        qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn :
                                                     rq->base.mqp.qpn;

        return 0;

err_destroy_rq:
        destroy_raw_packet_qp_rq(dev, rq);
err_destroy_sq:
        if (!qp->sq.wqe_cnt)
                return err;
        destroy_raw_packet_qp_sq(dev, sq);
err_destroy_tis:
        destroy_raw_packet_qp_tis(dev, sq, pd);

        return err;
}

static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev,
                                  struct mlx5_ib_qp *qp)
{
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;

        if (qp->rq.wqe_cnt) {
                destroy_raw_packet_qp_tir(dev, rq, qp->ibqp.pd);
                destroy_raw_packet_qp_rq(dev, rq);
        }

        if (qp->sq.wqe_cnt) {
                destroy_raw_packet_qp_sq(dev, sq);
                destroy_raw_packet_qp_tis(dev, sq, qp->ibqp.pd);
        }
}

static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
                                    struct mlx5_ib_raw_packet_qp *raw_packet_qp)
{
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;

        sq->sq = &qp->sq;
        rq->rq = &qp->rq;
        sq->doorbell = &qp->db;
        rq->doorbell = &qp->db;
}

static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
{
        mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn,
                              to_mpd(qp->ibqp.pd)->uid);
}

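/*
 * An RSS raw packet QP is only a TIR that spreads receives across the
 * QP's indirection table: validate the user's hash configuration and
 * program the RX hash field selectors accordingly.
 */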
static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                 struct ib_pd *pd,
                                 struct ib_qp_init_attr *init_attr,
                                 struct ib_udata *udata)
{
        struct ib_uobject *uobj = pd->uobject;
        struct ib_ucontext *ucontext = uobj->context;
        struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
        struct mlx5_ib_create_qp_resp resp = {};
        int inlen;
        int err;
        u32 *in;
        void *tirc;
        void *hfso;
        u32 selected_fields = 0;
        size_t min_resp_len;
        u32 tdn = mucontext->tdn;
        struct mlx5_ib_create_qp_rss ucmd = {};
        size_t required_cmd_sz;

        if (init_attr->qp_type != IB_QPT_RAW_PACKET)
                return -EOPNOTSUPP;

        if (init_attr->create_flags || init_attr->send_cq)
                return -EINVAL;

        min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index);
        if (udata->outlen < min_resp_len)
                return -EINVAL;

        required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1);
        if (udata->inlen < required_cmd_sz) {
                mlx5_ib_dbg(dev, "invalid inlen\n");
                return -EINVAL;
        }

        if (udata->inlen > sizeof(ucmd) &&
            !ib_is_udata_cleared(udata, sizeof(ucmd),
                                 udata->inlen - sizeof(ucmd))) {
                mlx5_ib_dbg(dev, "inlen is not supported\n");
                return -EOPNOTSUPP;
        }

        if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
                mlx5_ib_dbg(dev, "copy failed\n");
                return -EFAULT;
        }

        if (ucmd.comp_mask) {
                mlx5_ib_dbg(dev, "invalid comp mask\n");
                return -EOPNOTSUPP;
        }

        if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) {
                mlx5_ib_dbg(dev, "invalid reserved\n");
                return -EOPNOTSUPP;
        }

        err = ib_copy_to_udata(udata, &resp, min_resp_len);
        if (err) {
                mlx5_ib_dbg(dev, "copy failed\n");
                return -EINVAL;
        }

        inlen = MLX5_ST_SZ_BYTES(create_tir_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid);
        tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context);
        MLX5_SET(tirc, tirc, disp_type,
                 MLX5_TIRC_DISP_TYPE_INDIRECT);
        MLX5_SET(tirc, tirc, indirect_table,
                 init_attr->rwq_ind_tbl->ind_tbl_num);
        MLX5_SET(tirc, tirc, transport_domain, tdn);

        hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
        switch (ucmd.rx_hash_function) {
        case MLX5_RX_HASH_FUNC_TOEPLITZ:
        {
                void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
                size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key);

                if (len != ucmd.rx_key_len) {
                        err = -EINVAL;
                        goto err;
                }

                MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FUNC_TOEPLITZ);
                memcpy(rss_key, ucmd.rx_hash_key, len);
                break;
        }
        default:
                err = -EOPNOTSUPP;
                goto err;
        }

        if (!ucmd.rx_hash_fields_mask) {
                /* special case when this TIR serves as a steering entry without hashing */
                if (!init_attr->rwq_ind_tbl->log_ind_tbl_size)
                        goto create_tir;
                err = -EINVAL;
                goto err;
        }

        if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
             (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) &&
             ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
             (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) {
                err = -EINVAL;
                goto err;
        }

        /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */
        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4))
                MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
                         MLX5_L3_PROT_TYPE_IPV4);
        else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
                 (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
                MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
                         MLX5_L3_PROT_TYPE_IPV6);

        if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
             (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) &&
             ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
             (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) {
                err = -EINVAL;
                goto err;
        }

        /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */
        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
                MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
                         MLX5_L4_PROT_TYPE_TCP);
        else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
                 (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
                MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
                         MLX5_L4_PROT_TYPE_UDP);

        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6))
                selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP;

        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
                selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP;

        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP))
                selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT;

        if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) ||
            (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
                selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT;

        MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);

create_tir:
        err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);

        if (err)
                goto err;

        kvfree(in);
        /* qpn is reserved for that QP */
        qp->trans_qp.base.mqp.qpn = 0;
        qp->flags |= MLX5_IB_QP_RSS;
        return 0;

err:
        kvfree(in);
        return err;
}

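/*
 * Map a mask of supported atomic sizes to an atomic mode. For sizes
 * above 8 bytes the mode is the log2 of the largest supported size;
 * the driver handles at most 256-byte atomics.
 */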
static int atomic_size_to_mode(int size_mask)
{
        /* driver does not support atomic_size > 256B
         * and does not know how to translate bigger sizes
         */
        int supported_size_mask = size_mask & 0x1ff;
        int log_max_size;

        if (!supported_size_mask)
                return -EOPNOTSUPP;

        log_max_size = __fls(supported_size_mask);

        if (log_max_size > 3)
                return log_max_size;

        return MLX5_ATOMIC_MODE_8B;
}

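/*
 * Pick the atomic mode from the device capabilities: use a size-based
 * mode when masked atomics are supported, otherwise fall back to the
 * IB-spec compare-swap/fetch-add mode when both plain operations are
 * available.
 */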
static int get_atomic_mode(struct mlx5_ib_dev *dev,
                           enum ib_qp_type qp_type)
{
        u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
        u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic);
        int atomic_mode = -EOPNOTSUPP;
        int atomic_size_mask;

        if (!atomic)
                return -EOPNOTSUPP;

        if (qp_type == MLX5_IB_QPT_DCT)
                atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
        else
                atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);

        if ((atomic_operations & MLX5_ATOMIC_OPS_MASKED_CMP_SWAP) ||
            (atomic_operations & MLX5_ATOMIC_OPS_MASKED_FETCH_ADD))
                atomic_mode = atomic_size_to_mode(atomic_size_mask);

        if (atomic_mode <= 0 &&
            (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP &&
             atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD))
                atomic_mode = MLX5_ATOMIC_MODE_IB_COMP;

        return atomic_mode;
}

static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, struct mlx5_ib_qp *qp)
{
        struct mlx5_ib_resources *devr = &dev->devr;
        int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
        struct mlx5_core_dev *mdev = dev->mdev;
        struct mlx5_ib_create_qp_resp resp;
        struct mlx5_ib_cq *send_cq;
        struct mlx5_ib_cq *recv_cq;
        unsigned long flags;
        u32 uidx = MLX5_IB_DEFAULT_UIDX;
        struct mlx5_ib_create_qp ucmd;
        struct mlx5_ib_qp_base *base;
        void *qpc;
        u32 *in;
        int err;

        base = init_attr->qp_type == IB_QPT_RAW_PACKET ?
               &qp->raw_packet_qp.rq.base :
               &qp->trans_qp.base;

        if (init_attr->qp_type != IB_QPT_RAW_PACKET)
                mlx5_ib_odp_create_qp(qp);

        mutex_init(&qp->mutex);
        spin_lock_init(&qp->sq.lock);
        spin_lock_init(&qp->rq.lock);

        if (init_attr->rwq_ind_tbl) {
                if (!udata)
                        return -ENOSYS;

                err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata);
                return err;
        }

        if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
                if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
                        mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
                        return -EINVAL;
                } else {
                        qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
                }
        }

        if (init_attr->create_flags &
                        (IB_QP_CREATE_CROSS_CHANNEL |
                         IB_QP_CREATE_MANAGED_SEND |
                         IB_QP_CREATE_MANAGED_RECV)) {
                if (!MLX5_CAP_GEN(mdev, cd)) {
                        mlx5_ib_dbg(dev, "cross-channel isn't supported\n");
                        return -EINVAL;
                }
                if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL)
                        qp->flags |= MLX5_IB_QP_CROSS_CHANNEL;
                if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND)
                        qp->flags |= MLX5_IB_QP_MANAGED_SEND;
                if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
                        qp->flags |= MLX5_IB_QP_MANAGED_RECV;
        }

        if (init_attr->qp_type == IB_QPT_UD &&
            (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO))
                if (!MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) {
                        mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n");
                        return -EOPNOTSUPP;
                }

        if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
                if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
                        mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs\n");
                        return -EOPNOTSUPP;
                }
                if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) ||
                    !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
                        mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n");
                        return -EOPNOTSUPP;
                }
                qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS;
        }

        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
                qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;

        if (pd && pd->uobject) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
                        mlx5_ib_dbg(dev, "copy failed\n");
                        return -EFAULT;
                }

                err = get_qp_user_index(to_mucontext(pd->uobject->context),
                                        &ucmd, udata->inlen, &uidx);
                if (err)
                        return err;

                qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
                qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
        } else {
                qp->wq_sig = !!wq_signature;
        }

        qp->has_rq = qp_has_rq(init_attr);
        err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
                          qp, (pd && pd->uobject) ? &ucmd : NULL);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
        }

        if (pd) {
                if (pd->uobject) {
                        __u32 max_wqes =
                                1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
                        mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
                        if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
                            ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
                                mlx5_ib_dbg(dev, "invalid rq params\n");
                                return -EINVAL;
                        }
                        if (ucmd.sq_wqe_count > max_wqes) {
                                mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
                                            ucmd.sq_wqe_count, max_wqes);
                                return -EINVAL;
                        }
                        if (init_attr->create_flags &
                            MLX5_IB_QP_CREATE_SQPN_QP1) {
                                mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n");
                                return -EINVAL;
                        }
                        err = create_user_qp(dev, pd, qp, udata, init_attr, &in,
                                             &resp, &inlen, base);
                        if (err)
                                mlx5_ib_dbg(dev, "err %d\n", err);
                } else {
                        err = create_kernel_qp(dev, init_attr, qp, &in, &inlen,
                                               base);
                        if (err)
                                mlx5_ib_dbg(dev, "err %d\n", err);
                }

                if (err)
                        return err;
        } else {
                in = mlx5_vzalloc(inlen);
                if (!in)
                        return -ENOMEM;

                qp->create_type = MLX5_QP_EMPTY;
        }

        if (is_sqp(init_attr->qp_type))
                qp->port = init_attr->port_num;

        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);

        MLX5_SET(qpc, qpc, st, to_mlx5_st(init_attr->qp_type));
        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);

        if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR)
                MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn);
        else
                MLX5_SET(qpc, qpc, latency_sensitive, 1);

        if (qp->wq_sig)
                MLX5_SET(qpc, qpc, wq_signature, 1);

        if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
                MLX5_SET(qpc, qpc, block_lb_mc, 1);

        if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
                MLX5_SET(qpc, qpc, cd_master, 1);
        if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
                MLX5_SET(qpc, qpc, cd_slave_send, 1);
        if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
                MLX5_SET(qpc, qpc, cd_slave_receive, 1);

        if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
                int rcqe_sz;
                int scqe_sz;

                rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
                scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);

                if (rcqe_sz == 128)
                        MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
                else
                        MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);

                if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
                        if (scqe_sz == 128)
                                MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE);
                        else
                                MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
                }
        }

        if (qp->rq.wqe_cnt) {
                MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4);
                MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt));
        }

        if (init_attr->qp_type != IB_QPT_RAW_PACKET)
                MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(dev->mdev));

        MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr));

        if (qp->sq.wqe_cnt)
                MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt));
        else
                MLX5_SET(qpc, qpc, no_sq, 1);

        /* Set default resources */
        switch (init_attr->qp_type) {
        case IB_QPT_XRC_TGT:
                MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
                MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn);
                MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn);
                MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn);
                break;
        case IB_QPT_XRC_INI:
                MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
                MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
                MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn);
                break;
        default:
                if (init_attr->srq) {
                        MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn);
                        MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(init_attr->srq)->msrq.srqn);
                } else {
                        MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
                        MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s1)->msrq.srqn);
                }
        }

        if (init_attr->send_cq)
                MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn);

        if (init_attr->recv_cq)
                MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn);

        MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);

        /* 0xffffff (MLX5_IB_DEFAULT_UIDX) requests CQE version 0 */
        if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1)
                MLX5_SET(qpc, qpc, user_index, uidx);

        /* IB_QP_CREATE_IPOIB_UD_LSO is used to indicate an IPoIB QP */
        if (init_attr->qp_type == IB_QPT_UD &&
            (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) {
                MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1);
                qp->flags |= MLX5_IB_QP_LSO;
        }

        if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
                qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
                raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
                err = create_raw_packet_qp(dev, qp, in, pd);
        } else {
                err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
        }

        if (err) {
                mlx5_ib_dbg(dev, "create qp failed\n");
                goto err_create;
        }

        kvfree(in);

        base->container_mibqp = qp;
        base->mqp.event = mlx5_ib_qp_event;

        get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
                &send_cq, &recv_cq);
        spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
        mlx5_ib_lock_cqs(send_cq, recv_cq);
        /* Keep the QP on the device list; the reset flow walks it to
         * reach every QP.
         */
        list_add_tail(&qp->qps_list, &dev->qp_list);
        /* Keep the QP on its CQs' lists; these are also walked by the
         * reset flow.
         */
        if (send_cq)
                list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
        if (recv_cq)
                list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
        mlx5_ib_unlock_cqs(send_cq, recv_cq);
        spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);

        return 0;

err_create:
        if (qp->create_type == MLX5_QP_USER)
                destroy_qp_user(dev, pd, qp, base, udata);
        else if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);

        kvfree(in);
        return err;
}

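/*
 * Take both CQ locks in a consistent order (lower CQN first) so that
 * concurrent callers cannot deadlock. If the CQs are identical, or one
 * is absent, only a single lock is taken; the __acquire() annotations
 * keep static lock-balance checkers happy.
 */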
static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
        __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
{
        if (send_cq) {
                if (recv_cq) {
                        if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
                                spin_lock(&send_cq->lock);
                                spin_lock_nested(&recv_cq->lock,
                                                 SINGLE_DEPTH_NESTING);
                        } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
                                spin_lock(&send_cq->lock);
                                __acquire(&recv_cq->lock);
                        } else {
                                spin_lock(&recv_cq->lock);
                                spin_lock_nested(&send_cq->lock,
                                                 SINGLE_DEPTH_NESTING);
                        }
                } else {
                        spin_lock(&send_cq->lock);
                        __acquire(&recv_cq->lock);
                }
        } else if (recv_cq) {
                spin_lock(&recv_cq->lock);
                __acquire(&send_cq->lock);
        } else {
                __acquire(&send_cq->lock);
                __acquire(&recv_cq->lock);
        }
}

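/* Mirror of mlx5_ib_lock_cqs(); drops the locks in reverse order. */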
static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
        __releases(&send_cq->lock) __releases(&recv_cq->lock)
{
        if (send_cq) {
                if (recv_cq) {
                        if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
                                spin_unlock(&recv_cq->lock);
                                spin_unlock(&send_cq->lock);
                        } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
                                __release(&recv_cq->lock);
                                spin_unlock(&send_cq->lock);
                        } else {
                                spin_unlock(&send_cq->lock);
                                spin_unlock(&recv_cq->lock);
                        }
                } else {
                        __release(&recv_cq->lock);
                        spin_unlock(&send_cq->lock);
                }
        } else if (recv_cq) {
                __release(&send_cq->lock);
                spin_unlock(&recv_cq->lock);
        } else {
                __release(&recv_cq->lock);
                __release(&send_cq->lock);
        }
}

static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
{
        return to_mpd(qp->ibqp.pd);
}

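/*
 * Resolve the CQs a QP of the given type actually uses: XRC targets use
 * none, XRC initiators and UMR QPs use only a send CQ, and all other
 * transports use both.
 */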
static void get_cqs(enum ib_qp_type qp_type,
                    struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
                    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
{
        switch (qp_type) {
        case IB_QPT_XRC_TGT:
                *send_cq = NULL;
                *recv_cq = NULL;
                break;
        case MLX5_IB_QPT_REG_UMR:
        case IB_QPT_XRC_INI:
                *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
                *recv_cq = NULL;
                break;

        case IB_QPT_SMI:
        case MLX5_IB_QPT_HW_GSI:
        case IB_QPT_RC:
        case IB_QPT_UC:
        case IB_QPT_UD:
        case IB_QPT_RAW_IPV6:
        case IB_QPT_RAW_ETHERTYPE:
        case IB_QPT_RAW_PACKET:
                *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
                *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
                break;

        case IB_QPT_MAX:
        default:
                *send_cq = NULL;
                *recv_cq = NULL;
                break;
        }
}

static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                const struct mlx5_modify_raw_qp_param *raw_qp_param,
                                u8 lag_tx_affinity);

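/*
 * Tear down a QP created by create_qp_common(): move it to RESET so the
 * hardware stops touching it, unlink it from the reset-flow lists under
 * the CQ locks, flush its CQEs (kernel QPs only), then destroy the
 * hardware object and free the buffers.
 */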
static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                              struct ib_udata *udata)
{
        struct mlx5_ib_cq *send_cq, *recv_cq;
        struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        unsigned long flags;
        int err;

        if (qp->ibqp.rwq_ind_tbl) {
                destroy_rss_raw_qp_tir(dev, qp);
                return;
        }

        base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ?
               &qp->raw_packet_qp.rq.base :
               &qp->trans_qp.base;

        if (qp->state != IB_QPS_RESET) {
                if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
                        mlx5_ib_qp_disable_pagefaults(qp);
                        err = mlx5_core_qp_modify(dev->mdev,
                                                  MLX5_CMD_OP_2RST_QP, 0,
                                                  NULL, &base->mqp);
                } else {
                        struct mlx5_modify_raw_qp_param raw_qp_param = {
                                .operation = MLX5_CMD_OP_2RST_QP
                        };

                        err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0);
                }
                if (err)
                        mlx5_ib_warn(dev, "modify QP 0x%06x to RESET failed\n",
                                     base->mqp.qpn);
        }

        get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
                &send_cq, &recv_cq);

        spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
        mlx5_ib_lock_cqs(send_cq, recv_cq);
        /* del from lists under both locks above to protect reset flow paths */
        list_del(&qp->qps_list);
        if (send_cq)
                list_del(&qp->cq_send_list);

        if (recv_cq)
                list_del(&qp->cq_recv_list);

        if (qp->create_type == MLX5_QP_KERNEL) {
                __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
                if (send_cq != recv_cq)
                        __mlx5_ib_cq_clean(send_cq, base->mqp.qpn,
                                           NULL);
        }
        mlx5_ib_unlock_cqs(send_cq, recv_cq);
        spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);

        if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
                destroy_raw_packet_qp(dev, qp);
        } else {
                err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
                if (err)
                        mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
                                     base->mqp.qpn);
        }

        if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);
        else if (qp->create_type == MLX5_QP_USER)
                destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata);
}

static const char *ib_qp_type_str(enum ib_qp_type type)
{
        switch (type) {
        case IB_QPT_SMI:
                return "IB_QPT_SMI";
        case IB_QPT_GSI:
                return "IB_QPT_GSI";
        case IB_QPT_RC:
                return "IB_QPT_RC";
        case IB_QPT_UC:
                return "IB_QPT_UC";
        case IB_QPT_UD:
                return "IB_QPT_UD";
        case IB_QPT_RAW_IPV6:
                return "IB_QPT_RAW_IPV6";
        case IB_QPT_RAW_ETHERTYPE:
                return "IB_QPT_RAW_ETHERTYPE";
        case IB_QPT_XRC_INI:
                return "IB_QPT_XRC_INI";
        case IB_QPT_XRC_TGT:
                return "IB_QPT_XRC_TGT";
        case IB_QPT_RAW_PACKET:
                return "IB_QPT_RAW_PACKET";
        case MLX5_IB_QPT_REG_UMR:
                return "MLX5_IB_QPT_REG_UMR";
        case IB_QPT_MAX:
        default:
                return "Invalid QP type";
        }
}

struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
                                struct ib_qp_init_attr *init_attr,
                                struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev;
        struct mlx5_ib_qp *qp;
        u16 xrcdn = 0;
        int err;

        if (pd) {
                dev = to_mdev(pd->device);

                if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
                        if (!pd->uobject) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
                                return ERR_PTR(-EINVAL);
                        } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
                                return ERR_PTR(-EINVAL);
                        }
                }
        } else {
                /* be cautious: only XRC_TGT and REG_UMR QPs may lack a PD */
                if (init_attr->qp_type != IB_QPT_XRC_TGT &&
                    init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
                        pr_warn("%s: no PD for transport %s\n", __func__,
                                ib_qp_type_str(init_attr->qp_type));
                        return ERR_PTR(-EINVAL);
                }
                dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
        }

        switch (init_attr->qp_type) {
        case IB_QPT_XRC_TGT:
        case IB_QPT_XRC_INI:
                if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
                        mlx5_ib_dbg(dev, "XRC not supported\n");
                        return ERR_PTR(-ENOSYS);
                }
                init_attr->recv_cq = NULL;
                if (init_attr->qp_type == IB_QPT_XRC_TGT) {
                        xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
                        init_attr->send_cq = NULL;
                }

                /* fall through */
        case IB_QPT_RAW_PACKET:
        case IB_QPT_RC:
        case IB_QPT_UC:
        case IB_QPT_UD:
        case IB_QPT_SMI:
        case MLX5_IB_QPT_HW_GSI:
        case MLX5_IB_QPT_REG_UMR:
                qp = kzalloc(sizeof(*qp), GFP_KERNEL);
                if (!qp)
                        return ERR_PTR(-ENOMEM);

                err = create_qp_common(dev, pd, init_attr, udata, qp);
                if (err) {
                        mlx5_ib_dbg(dev, "create_qp_common failed\n");
                        kfree(qp);
                        return ERR_PTR(err);
                }

                if (is_qp0(init_attr->qp_type))
                        qp->ibqp.qp_num = 0;
                else if (is_qp1(init_attr->qp_type))
                        qp->ibqp.qp_num = 1;
                else
                        qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;

                mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
                            qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
                            init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1,
                            init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1);

                qp->trans_qp.xrcdn = xrcdn;

                break;

        case IB_QPT_GSI:
                return mlx5_ib_gsi_create_qp(pd, init_attr);

        case IB_QPT_RAW_IPV6:
        case IB_QPT_RAW_ETHERTYPE:
        case IB_QPT_MAX:
        default:
                mlx5_ib_dbg(dev, "unsupported qp type %d\n",
                            init_attr->qp_type);
                /* RAW_IPV6 and RAW_ETHERTYPE QPs are not supported */
                return ERR_PTR(-EINVAL);
        }

        return &qp->ibqp;
}

int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(qp->device);
        struct mlx5_ib_qp *mqp = to_mqp(qp);

        if (unlikely(qp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_destroy_qp(qp);

        destroy_qp_common(dev, mqp, udata);

        kfree(mqp);

        return 0;
}

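/*
 * Convert IB access flags into the hardware RRE/RWE/RAE bits. With a
 * responder depth of zero the QP cannot accept reads or atomics, so
 * everything except remote write is masked off.
 */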
static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
                                const struct ib_qp_attr *attr,
                                int attr_mask, __be32 *hw_access_flags_be)
{
        u8 dest_rd_atomic;
        u32 access_flags, hw_access_flags = 0;

        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);

        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
                dest_rd_atomic = attr->max_dest_rd_atomic;
        else
                dest_rd_atomic = qp->trans_qp.resp_depth;

        if (attr_mask & IB_QP_ACCESS_FLAGS)
                access_flags = attr->qp_access_flags;
        else
                access_flags = qp->trans_qp.atomic_rd_en;

        if (!dest_rd_atomic)
                access_flags &= IB_ACCESS_REMOTE_WRITE;

        if (access_flags & IB_ACCESS_REMOTE_READ)
                hw_access_flags |= MLX5_QP_BIT_RRE;
        if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
                int atomic_mode;

                atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type);
                if (atomic_mode < 0)
                        return -EOPNOTSUPP;

                hw_access_flags |= MLX5_QP_BIT_RAE;
                hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFF;
        }

        if (access_flags & IB_ACCESS_REMOTE_WRITE)
                hw_access_flags |= MLX5_QP_BIT_RWE;

        *hw_access_flags_be = cpu_to_be32(hw_access_flags);

        return 0;
}

enum {
        MLX5_PATH_FLAG_FL       = 1 << 0,
        MLX5_PATH_FLAG_FREE_AR  = 1 << 1,
        MLX5_PATH_FLAG_COUNTER  = 1 << 2,
};

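/*
 * Map an IB static rate onto the device's stat_rate support mask,
 * stepping down to the next lower rate until a supported one is found.
 * IB_RATE_PORT_CURRENT maps to 0 (use the port's current rate).
 */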
static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
{
        if (rate == IB_RATE_PORT_CURRENT) {
                return 0;
        } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS) {
                return -EINVAL;
        } else {
                while (rate != IB_RATE_2_5_GBPS &&
                       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
                         MLX5_CAP_GEN(dev->mdev, stat_rate_support)))
                        --rate;
        }

        return rate + MLX5_STAT_RATE_OFFSET;
}

static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
                                      struct mlx5_ib_sq *sq, u8 sl,
                                      struct ib_pd *pd)
{
        void *in;
        void *tisc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(modify_tis_in, in, bitmask.prio, 1);
        MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid);

        tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
        MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));

        err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);

        kvfree(in);

        return err;
}

static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev,
                                         struct mlx5_ib_sq *sq, u8 tx_affinity,
                                         struct ib_pd *pd)
{
        void *in;
        void *tisc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(modify_tis_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1);
        MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid);

        tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
        MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity);

        err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);

        kvfree(in);

        return err;
}

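/*
 * Fill a hardware address path from an IB address handle. Ethernet
 * (RoCE) ports take the MAC/UDP-sport/DSCP path and require a GRH; IB
 * ports take the LID/GRH path. Raw packet QPs additionally push the SL
 * into their TIS priority.
 */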
static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                         const struct ib_ah_attr *ah,
                         struct mlx5_qp_path *path, u8 port, int attr_mask,
                         u32 path_flags, const struct ib_qp_attr *attr,
                         bool alt)
{
        enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port);
        int err;
        enum ib_gid_type gid_type;

        if (attr_mask & IB_QP_PKEY_INDEX)
                path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index :
                                                     attr->pkey_index);

        if (ah->ah_flags & IB_AH_GRH) {
                if (ah->grh.sgid_index >=
                    dev->mdev->port_caps[port - 1].gid_table_len) {
                        pr_err("sgid_index (%u) too large. max is %d\n",
                               ah->grh.sgid_index,
                               dev->mdev->port_caps[port - 1].gid_table_len);
                        return -EINVAL;
                }
        }

        if (ll == IB_LINK_LAYER_ETHERNET) {
                if (!(ah->ah_flags & IB_AH_GRH))
                        return -EINVAL;
                err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index,
                                             &gid_type);
                if (err)
                        return err;
                memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
                path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
                                                          ah->grh.sgid_index);
                path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4;
                if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
                        path->ecn_dscp = (ah->grh.traffic_class >> 2) & 0x3f;
        } else {
                path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
                path->fl_free_ar |=
                        (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0;
                path->rlid = cpu_to_be16(ah->dlid);
                path->grh_mlid = ah->src_path_bits & 0x7f;
                if (ah->ah_flags & IB_AH_GRH)
                        path->grh_mlid  |= 1 << 7;
                path->dci_cfi_prio_sl = ah->sl & 0xf;
        }

        if (ah->ah_flags & IB_AH_GRH) {
                path->mgid_index = ah->grh.sgid_index;
                path->hop_limit  = ah->grh.hop_limit;
                path->tclass_flowlabel =
                        cpu_to_be32((ah->grh.traffic_class << 20) |
                                    (ah->grh.flow_label));
                memcpy(path->rgid, ah->grh.dgid.raw, 16);
        }

        err = ib_rate_to_mlx5(dev, ah->static_rate);
        if (err < 0)
                return err;
        path->static_rate = err;
        path->port = port;

        if (attr_mask & IB_QP_TIMEOUT)
                path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3;

        if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
                return modify_raw_packet_eth_prio(dev->mdev,
                                                  &qp->raw_packet_qp.sq,
                                                  ah->sl & 0xf, qp->ibqp.pd);

        return 0;
}

static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
        [MLX5_QP_STATE_INIT] = {
                [MLX5_QP_STATE_INIT] = {
                        [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE            |
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PKEY_INDEX     |
                                          MLX5_QP_OPTPAR_PRI_PORT,
                        [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PKEY_INDEX     |
                                          MLX5_QP_OPTPAR_PRI_PORT,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
                                          MLX5_QP_OPTPAR_Q_KEY          |
                                          MLX5_QP_OPTPAR_PRI_PORT,
                },
                [MLX5_QP_STATE_RTR] = {
                        [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
                                          MLX5_QP_OPTPAR_RRE            |
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PKEY_INDEX,
                        [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PKEY_INDEX,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
                                          MLX5_QP_OPTPAR_Q_KEY,
                        [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX    |
                                           MLX5_QP_OPTPAR_Q_KEY,
                        [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
                                          MLX5_QP_OPTPAR_RRE            |
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PKEY_INDEX,
                },
        },
        [MLX5_QP_STATE_RTR] = {
                [MLX5_QP_STATE_RTS] = {
                        [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
                                          MLX5_QP_OPTPAR_RRE            |
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PM_STATE       |
                                          MLX5_QP_OPTPAR_RNR_TIMEOUT,
                        [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PM_STATE,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
                },
        },
        [MLX5_QP_STATE_RTS] = {
                [MLX5_QP_STATE_RTS] = {
                        [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE            |
                                          MLX5_QP_OPTPAR_RAE            |
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_RNR_TIMEOUT    |
                                          MLX5_QP_OPTPAR_PM_STATE       |
                                          MLX5_QP_OPTPAR_ALT_ADDR_PATH,
                        [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PM_STATE       |
                                          MLX5_QP_OPTPAR_ALT_ADDR_PATH,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY          |
                                          MLX5_QP_OPTPAR_SRQN           |
                                          MLX5_QP_OPTPAR_CQN_RCV,
                },
        },
        [MLX5_QP_STATE_SQER] = {
                [MLX5_QP_STATE_RTS] = {
                        [MLX5_QP_ST_UD]  = MLX5_QP_OPTPAR_Q_KEY,
                        [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
                        [MLX5_QP_ST_UC]  = MLX5_QP_OPTPAR_RWE,
                        [MLX5_QP_ST_RC]  = MLX5_QP_OPTPAR_RNR_TIMEOUT   |
                                           MLX5_QP_OPTPAR_RWE           |
                                           MLX5_QP_OPTPAR_RAE           |
                                           MLX5_QP_OPTPAR_RRE,
                },
        },
};

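/*
 * Map a single IB_QP_* attribute bit to the mlx5 optional-parameter
 * bits that must accompany it; attributes carried elsewhere in the QP
 * context map to 0.
 */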
static int ib_nr_to_mlx5_nr(int ib_mask)
{
        switch (ib_mask) {
        case IB_QP_STATE:
                return 0;
        case IB_QP_CUR_STATE:
                return 0;
        case IB_QP_EN_SQD_ASYNC_NOTIFY:
                return 0;
        case IB_QP_ACCESS_FLAGS:
                return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
                        MLX5_QP_OPTPAR_RAE;
        case IB_QP_PKEY_INDEX:
                return MLX5_QP_OPTPAR_PKEY_INDEX;
        case IB_QP_PORT:
                return MLX5_QP_OPTPAR_PRI_PORT;
        case IB_QP_QKEY:
                return MLX5_QP_OPTPAR_Q_KEY;
        case IB_QP_AV:
                return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH |
                        MLX5_QP_OPTPAR_PRI_PORT;
        case IB_QP_PATH_MTU:
                return 0;
        case IB_QP_TIMEOUT:
                return MLX5_QP_OPTPAR_ACK_TIMEOUT;
        case IB_QP_RETRY_CNT:
                return MLX5_QP_OPTPAR_RETRY_COUNT;
        case IB_QP_RNR_RETRY:
                return MLX5_QP_OPTPAR_RNR_RETRY;
        case IB_QP_RQ_PSN:
                return 0;
        case IB_QP_MAX_QP_RD_ATOMIC:
                return MLX5_QP_OPTPAR_SRA_MAX;
        case IB_QP_ALT_PATH:
                return MLX5_QP_OPTPAR_ALT_ADDR_PATH;
        case IB_QP_MIN_RNR_TIMER:
                return MLX5_QP_OPTPAR_RNR_TIMEOUT;
        case IB_QP_SQ_PSN:
                return 0;
        case IB_QP_MAX_DEST_RD_ATOMIC:
                return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE |
                        MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE;
        case IB_QP_PATH_MIG_STATE:
                return MLX5_QP_OPTPAR_PM_STATE;
        case IB_QP_CAP:
                return 0;
        case IB_QP_DEST_QPN:
                return 0;
        }
        return 0;
}

static int ib_mask_to_mlx5_opt(int ib_mask)
{
        int result = 0;
        int i;

        for (i = 0; i < 8 * sizeof(int); i++) {
                if ((1 << i) & ib_mask)
                        result |= ib_nr_to_mlx5_nr(1 << i);
        }

        return result;
}

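/*
 * Move the RQ of a raw packet QP to a new state with MODIFY_RQ,
 * optionally retargeting its counter set when the firmware supports
 * modify_rq_counters_set_id.
 */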
static int modify_raw_packet_qp_rq(
        struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state,
        const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd)
{
        void *in;
        void *rqc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(modify_rq_in, in, rqn, rq->base.mqp.qpn);
        MLX5_SET(modify_rq_in, in, rq_state, rq->state);
        MLX5_SET(modify_rq_in, in, uid, to_mpd(pd)->uid);

        rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
        MLX5_SET(rqc, rqc, state, new_state);

        if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) {
                if (MLX5_CAP_GEN(dev->mdev, modify_rq_counters_set_id)) {
                        MLX5_SET64(modify_rq_in, in, modify_bitmask,
                                   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID);
                        MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id);
                } else {
                        pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n",
                                     dev->ib_dev.name);
                }
        }

        err = mlx5_core_modify_rq(dev->mdev, in, inlen);
        if (err)
                goto out;

        rq->state = new_state;

out:
        kvfree(in);
        return err;
}

static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev,
                                   struct mlx5_ib_sq *sq, int new_state,
                                   struct ib_pd *pd)
{
        void *in;
        void *sqc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(modify_sq_in, in, sqn, sq->base.mqp.qpn);
        MLX5_SET(modify_sq_in, in, uid, to_mpd(pd)->uid);
        MLX5_SET(modify_sq_in, in, sq_state, sq->state);

        sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
        MLX5_SET(sqc, sqc, state, new_state);

        err = mlx5_core_modify_sq(dev, in, inlen);
        if (err)
                goto out;

        sq->state = new_state;

out:
        kvfree(in);
        return err;
}

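/*
 * A raw packet QP is implemented as separate SQ and RQ objects, so QP
 * state transitions are translated into the matching SQC/RQC states and
 * applied to whichever side exists. Transitions that only carry
 * parameter changes (INIT2INIT, INIT2RTR, RTR2RTS, RTS2RTS) are no-ops
 * here.
 */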
static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                const struct mlx5_modify_raw_qp_param *raw_qp_param,
                                u8 tx_affinity)
{
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        int modify_rq = !!qp->rq.wqe_cnt;
        int modify_sq = !!qp->sq.wqe_cnt;
        int rq_state;
        int sq_state;
        int err;

        switch (raw_qp_param->operation) {
        case MLX5_CMD_OP_RST2INIT_QP:
                rq_state = MLX5_RQC_STATE_RDY;
                sq_state = MLX5_SQC_STATE_RDY;
                break;
        case MLX5_CMD_OP_2ERR_QP:
                rq_state = MLX5_RQC_STATE_ERR;
                sq_state = MLX5_SQC_STATE_ERR;
                break;
        case MLX5_CMD_OP_2RST_QP:
                rq_state = MLX5_RQC_STATE_RST;
                sq_state = MLX5_SQC_STATE_RST;
                break;
        case MLX5_CMD_OP_RTR2RTS_QP:
        case MLX5_CMD_OP_RTS2RTS_QP:
        case MLX5_CMD_OP_INIT2INIT_QP:
        case MLX5_CMD_OP_INIT2RTR_QP:
                return raw_qp_param->set_mask ? -EINVAL : 0;
        default:
                WARN_ON(1);
                return -EINVAL;
        }

        if (modify_rq) {
                err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param,
                                              qp->ibqp.pd);
                if (err)
                        return err;
        }

        if (modify_sq) {
                if (tx_affinity) {
                        err = modify_raw_packet_tx_affinity(dev->mdev, sq,
                                                            tx_affinity,
                                                            qp->ibqp.pd);
                        if (err)
                                return err;
                }

                return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, qp->ibqp.pd);
        }

        return 0;
}

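/*
 * Apply a validated QP state transition: build the mlx5 QP context from
 * the attribute mask, pick the firmware opcode from the state pair, and
 * post the modify command (or its raw packet equivalent).
 */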
static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
                               enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
        static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
                [MLX5_QP_STATE_RST] = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                        [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_RST2INIT_QP,
                },
                [MLX5_QP_STATE_INIT]  = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                        [MLX5_QP_STATE_INIT]    = MLX5_CMD_OP_INIT2INIT_QP,
                        [MLX5_QP_STATE_RTR]     = MLX5_CMD_OP_INIT2RTR_QP,
                },
                [MLX5_QP_STATE_RTR]   = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                        [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTR2RTS_QP,
                },
                [MLX5_QP_STATE_RTS]   = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                        [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_RTS2RTS_QP,
                },
                [MLX5_QP_STATE_SQD] = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                },
                [MLX5_QP_STATE_SQER] = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                        [MLX5_QP_STATE_RTS]     = MLX5_CMD_OP_SQERR2RTS_QP,
                },
                [MLX5_QP_STATE_ERR] = {
                        [MLX5_QP_STATE_RST]     = MLX5_CMD_OP_2RST_QP,
                        [MLX5_QP_STATE_ERR]     = MLX5_CMD_OP_2ERR_QP,
                }
        };

        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
        struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct mlx5_ib_cq *send_cq, *recv_cq;
        struct mlx5_qp_context *context;
        struct mlx5_ib_pd *pd;
        struct mlx5_ib_port *mibport = NULL;
        enum mlx5_qp_state mlx5_cur, mlx5_new;
        enum mlx5_qp_optpar optpar;
        int mlx5_st;
        int err;
        u16 op;

        context = kzalloc(sizeof(*context), GFP_KERNEL);
        if (!context)
                return -ENOMEM;

        err = to_mlx5_st(ibqp->qp_type);
        if (err < 0) {
                mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type);
                goto out;
        }

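        /* a non-negative return from to_mlx5_st() is the transport type */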
        context->flags = cpu_to_be32(err << 16);

        if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
                context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
        } else {
                switch (attr->path_mig_state) {
                case IB_MIG_MIGRATED:
                        context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
                        break;
                case IB_MIG_REARM:
                        context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
                        break;
                case IB_MIG_ARMED:
                        context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
                        break;
                }
        }

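        /* mtu_msgmax: MTU enum in bits 7:5, log2 max message size in bits 4:0 */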
        if (is_sqp(ibqp->qp_type)) {
                context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
        } else if (ibqp->qp_type == IB_QPT_UD ||
                   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
                context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
        } else if (attr_mask & IB_QP_PATH_MTU) {
                if (attr->path_mtu < IB_MTU_256 ||
                    attr->path_mtu > IB_MTU_4096) {
                        mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu);
                        err = -EINVAL;
                        goto out;
                }
                context->mtu_msgmax = (attr->path_mtu << 5) |
                                      (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg);
        }

        if (attr_mask & IB_QP_DEST_QPN)
                context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);

        if (attr_mask & IB_QP_PKEY_INDEX)
                context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index);

        /* TODO: implement counter_index functionality */

        if (is_sqp(ibqp->qp_type))
                context->pri_path.port = qp->port;

        if (attr_mask & IB_QP_PORT)
                context->pri_path.port = attr->port_num;

        if (attr_mask & IB_QP_AV) {
                err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path,
                                    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
                                    attr_mask, 0, attr, false);
                if (err)
                        goto out;
        }

        if (attr_mask & IB_QP_TIMEOUT)
                context->pri_path.ackto_lt |= attr->timeout << 3;

        if (attr_mask & IB_QP_ALT_PATH) {
                err = mlx5_set_path(dev, qp, &attr->alt_ah_attr,
                                    &context->alt_path,
                                    attr->alt_port_num,
                                    attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT,
                                    0, attr, true);
                if (err)
                        goto out;
        }

        pd = get_pd(qp);
        get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
                &send_cq, &recv_cq);

        context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
        context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
        context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
        context->params1  = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);

        if (attr_mask & IB_QP_RNR_RETRY)
                context->params1 |= cpu_to_be32(attr->rnr_retry << 13);

        if (attr_mask & IB_QP_RETRY_CNT)
                context->params1 |= cpu_to_be32(attr->retry_cnt << 16);

        if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
                if (attr->max_rd_atomic)
                        context->params1 |=
                                cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
        }

        if (attr_mask & IB_QP_SQ_PSN)
                context->next_send_psn = cpu_to_be32(attr->sq_psn);

        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
                if (attr->max_dest_rd_atomic)
                        context->params2 |=
                                cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
        }

        if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
                __be32 access_flags;

                err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
                if (err)
                        goto out;

                context->params2 |= access_flags;
        }

        if (attr_mask & IB_QP_MIN_RNR_TIMER)
                context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);

        if (attr_mask & IB_QP_RQ_PSN)
                context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);

        if (attr_mask & IB_QP_QKEY)
                context->qkey = cpu_to_be32(attr->qkey);

        if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
                context->db_rec_addr = cpu_to_be64(qp->db.dma);

        if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
                u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
                               qp->port) - 1;
                mibport = &dev->port[port_num];
                context->qp_counter_set_usr_page |=
                        cpu_to_be32((u32)(mibport->q_cnt_id) << 24);
        }

        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
                context->sq_crq_size |= cpu_to_be16(1 << 4);

        if (qp->flags & MLX5_IB_QP_SQPN_QP1)
                context->deth_sqpn = cpu_to_be32(1);

        mlx5_cur = to_mlx5_state(cur_state);
        mlx5_new = to_mlx5_state(new_state);
        mlx5_st = to_mlx5_st(ibqp->qp_type);
        if (mlx5_st < 0) {
                err = -EINVAL;
                goto out;
        }

        /* If moving to a reset or error state, we must disable page faults on
         * this QP and flush all current page faults. Otherwise a stale page
         * fault may attempt to work on this QP after it is reset and moved
         * again to RTS, and may cause the driver and the device to get out of
         * sync. */
        if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
            (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) &&
            (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
                mlx5_ib_qp_disable_pagefaults(qp);

        if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
            !optab[mlx5_cur][mlx5_new]) {
                err = -EINVAL;
                goto out;
        }

        op = optab[mlx5_cur][mlx5_new];
        optpar = ib_mask_to_mlx5_opt(attr_mask);
        optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];

        if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
                struct mlx5_modify_raw_qp_param raw_qp_param = {};

                raw_qp_param.operation = op;
                if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
                        raw_qp_param.rq_q_ctr_id = mibport->q_cnt_id;
                        raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
                }
                err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0);
        } else {
                err = mlx5_core_qp_modify(dev->mdev, op, optpar, context,
                                          &base->mqp);
        }

        if (err)
                goto out;

        if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT &&
            (qp->ibqp.qp_type != IB_QPT_RAW_PACKET))
                mlx5_ib_qp_enable_pagefaults(qp);

        qp->state = new_state;

        if (attr_mask & IB_QP_ACCESS_FLAGS)
                qp->trans_qp.atomic_rd_en = attr->qp_access_flags;
        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
                qp->trans_qp.resp_depth = attr->max_dest_rd_atomic;
        if (attr_mask & IB_QP_PORT)
                qp->port = attr->port_num;
        if (attr_mask & IB_QP_ALT_PATH)
                qp->trans_qp.alt_port = attr->alt_port_num;

        /*
         * If we moved a kernel QP to RESET, clean up all old CQ
         * entries and reinitialize the QP.
         */
        if (new_state == IB_QPS_RESET && !ibqp->uobject) {
                mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
                                 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
                if (send_cq != recv_cq)
                        mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL);

                qp->rq.head = 0;
                qp->rq.tail = 0;
                qp->sq.head = 0;
                qp->sq.tail = 0;
                qp->sq.cur_post = 0;
                qp->sq.last_poll = 0;
                qp->db.db[MLX5_RCV_DBR] = 0;
                qp->db.db[MLX5_SND_DBR] = 0;
        }

out:
        kfree(context);
        return err;
}

int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                      int attr_mask, struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
        enum ib_qp_type qp_type;
        enum ib_qp_state cur_state, new_state;
        int err = -EINVAL;
        int port;

        if (ibqp->rwq_ind_tbl)
                return -ENOSYS;

        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);

        qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ?
                IB_QPT_GSI : ibqp->qp_type;

        mutex_lock(&qp->mutex);

        cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
        new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;

        if (qp_type != MLX5_IB_QPT_REG_UMR &&
            !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask)) {
                mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n",
                            cur_state, new_state, ibqp->qp_type, attr_mask);
                goto out;
        }

        if ((attr_mask & IB_QP_PORT) &&
            (attr->port_num == 0 ||
             attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) {
                mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n",
                            attr->port_num, MLX5_CAP_GEN(dev->mdev, num_ports));
                goto out;
        }

        if (attr_mask & IB_QP_PKEY_INDEX) {
                port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
                if (attr->pkey_index >=
                    dev->mdev->port_caps[port - 1].pkey_table_len) {
                        mlx5_ib_dbg(dev, "invalid pkey index %d\n",
                                    attr->pkey_index);
                        goto out;
                }
        }

        if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
            attr->max_rd_atomic >
            (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) {
                mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n",
                            attr->max_rd_atomic);
                goto out;
        }

        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
            attr->max_dest_rd_atomic >
            (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) {
                mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n",
                            attr->max_dest_rd_atomic);
                goto out;
        }

        if (cur_state == new_state && cur_state == IB_QPS_RESET) {
                err = 0;
                goto out;
        }

        err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);

out:
        mutex_unlock(&qp->mutex);
        return err;
}

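/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * The first comparison is lockless; on the slow path head and tail are
 * re-read under the CQ lock, since the tail only advances as CQEs for
 * this QP are polled.
 */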
static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
{
        struct mlx5_ib_cq *cq;
        unsigned cur;

        cur = wq->head - wq->tail;
        if (likely(cur + nreq < wq->max_post))
                return 0;

        cq = to_mcq(ib_cq);
        spin_lock(&cq->lock);
        cur = wq->head - wq->tail;
        spin_unlock(&cq->lock);

        return cur + nreq >= wq->max_post;
}

static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
                                          u64 remote_addr, u32 rkey)
{
        rseg->raddr    = cpu_to_be64(remote_addr);
        rseg->rkey     = cpu_to_be32(rkey);
        rseg->reserved = 0;
}

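/*
 * Build the Ethernet segment of a send WQE. For LSO the packet headers
 * are copied inline after the segment, wrapping around to the start of
 * the queue if they would spill past its end.
 */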
static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg,
                         const struct ib_send_wr *wr, void *qend,
                         struct mlx5_ib_qp *qp, int *size)
{
        void *seg = eseg;

        memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg));

        if (wr->send_flags & IB_SEND_IP_CSUM)
                eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM |
                                 MLX5_ETH_WQE_L4_CSUM;

        seg += sizeof(struct mlx5_wqe_eth_seg);
        *size += sizeof(struct mlx5_wqe_eth_seg) / 16;

        if (wr->opcode == IB_WR_LSO) {
                struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
                int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
                u64 left, leftlen, copysz;
                void *pdata = ud_wr->header;

                left = ud_wr->hlen;
                eseg->mss = cpu_to_be16(ud_wr->mss);
                eseg->inline_hdr_sz = cpu_to_be16(left);

                /*
                 * Check whether there is enough space up to the end of the
                 * queue; if so, copy the whole header in one shot. Otherwise
                 * copy up to the end of the queue, wrap around to the start,
                 * and then copy the remainder.
                 */
                leftlen = qend - (void *)eseg->inline_hdr_start;
                copysz = min_t(u64, leftlen, left);

                memcpy(seg - size_of_inl_hdr_start, pdata, copysz);

                if (likely(copysz > size_of_inl_hdr_start)) {
                        seg += ALIGN(copysz - size_of_inl_hdr_start, 16);
                        *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16;
                }

                if (unlikely(copysz < left)) { /* the last wqe in the queue */
                        seg = mlx5_get_send_wqe(qp, 0);
                        left -= copysz;
                        pdata += copysz;
                        memcpy(seg, pdata, left);
                        seg += ALIGN(left, 16);
                        *size += ALIGN(left, 16) / 16;
                }
        }

        return seg;
}

static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
                             const struct ib_send_wr *wr)
{
        memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av));
        dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV);
        dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey);
}

static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
{
        dseg->byte_count = cpu_to_be32(sg->length);
        dseg->lkey       = cpu_to_be32(sg->lkey);
        dseg->addr       = cpu_to_be64(sg->addr);
}

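/*
 * Convert a descriptor count to 16-byte octowords: descriptors are
 * counted in 8-byte units (callers double the count for 16-byte KLMs),
 * padded to a multiple of 8 and packed two per octoword; e.g. five
 * descriptors pad to 8, i.e. 4 octowords.
 */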
static __be16 get_klm_octo(int npages)
{
        return cpu_to_be16(ALIGN(npages, 8) / 2);
}

static __be64 frwr_mkey_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_LEN             |
                MLX5_MKEY_MASK_PAGE_SIZE        |
                MLX5_MKEY_MASK_START_ADDR       |
                MLX5_MKEY_MASK_EN_RINVAL        |
                MLX5_MKEY_MASK_KEY              |
                MLX5_MKEY_MASK_LR               |
                MLX5_MKEY_MASK_LW               |
                MLX5_MKEY_MASK_RR               |
                MLX5_MKEY_MASK_RW               |
                MLX5_MKEY_MASK_A                |
                MLX5_MKEY_MASK_SMALL_FENCE      |
                MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 sig_mkey_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_LEN             |
                MLX5_MKEY_MASK_PAGE_SIZE        |
                MLX5_MKEY_MASK_START_ADDR       |
                MLX5_MKEY_MASK_EN_SIGERR        |
                MLX5_MKEY_MASK_EN_RINVAL        |
                MLX5_MKEY_MASK_KEY              |
                MLX5_MKEY_MASK_LR               |
                MLX5_MKEY_MASK_LW               |
                MLX5_MKEY_MASK_RR               |
                MLX5_MKEY_MASK_RW               |
                MLX5_MKEY_MASK_SMALL_FENCE      |
                MLX5_MKEY_MASK_FREE             |
                MLX5_MKEY_MASK_BSF_EN;

        return cpu_to_be64(result);
}

static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
                                struct mlx5_ib_mr *mr)
{
        int ndescs = mr->ndescs;

        memset(umr, 0, sizeof(*umr));

        if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
                /* KLMs take twice the size of MTTs */
                ndescs *= 2;

        umr->flags = MLX5_UMR_CHECK_NOT_FREE;
        umr->klm_octowords = get_klm_octo(ndescs);
        umr->mkey_mask = frwr_mkey_mask();
}

static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
{
        memset(umr, 0, sizeof(*umr));
        umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
        umr->flags = MLX5_UMR_INLINE;
}

static __be64 get_umr_reg_mr_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_LEN             |
                 MLX5_MKEY_MASK_PAGE_SIZE       |
                 MLX5_MKEY_MASK_START_ADDR      |
                 MLX5_MKEY_MASK_PD              |
                 MLX5_MKEY_MASK_LR              |
                 MLX5_MKEY_MASK_LW              |
                 MLX5_MKEY_MASK_KEY             |
                 MLX5_MKEY_MASK_RR              |
                 MLX5_MKEY_MASK_RW              |
                 MLX5_MKEY_MASK_A               |
                 MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 get_umr_unreg_mr_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 get_umr_update_mtt_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 get_umr_update_translation_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_LEN |
                 MLX5_MKEY_MASK_PAGE_SIZE |
                 MLX5_MKEY_MASK_START_ADDR |
                 MLX5_MKEY_MASK_KEY |
                 MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 get_umr_update_access_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_LW |
                 MLX5_MKEY_MASK_RR |
                 MLX5_MKEY_MASK_RW |
                 MLX5_MKEY_MASK_A |
                 MLX5_MKEY_MASK_KEY |
                 MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static __be64 get_umr_update_pd_mask(void)
{
        u64 result;

        result = MLX5_MKEY_MASK_PD |
                 MLX5_MKEY_MASK_KEY |
                 MLX5_MKEY_MASK_FREE;

        return cpu_to_be64(result);
}

static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
                                const struct ib_send_wr *wr)
{
        const struct mlx5_umr_wr *umrwr = umr_wr(wr);

        memset(umr, 0, sizeof(*umr));

        if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
                umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
        else
                umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */

        if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
                umr->klm_octowords = get_klm_octo(umrwr->npages);
                if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
                        umr->mkey_mask = get_umr_update_mtt_mask();
                        umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
                        umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
                }
                if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
                        umr->mkey_mask |= get_umr_update_translation_mask();
                if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS)
                        umr->mkey_mask |= get_umr_update_access_mask();
                if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD)
                        umr->mkey_mask |= get_umr_update_pd_mask();
                if (!umr->mkey_mask)
                        umr->mkey_mask = get_umr_reg_mr_mask();
        } else {
                umr->mkey_mask = get_umr_unreg_mr_mask();
        }

        if (!wr->num_sge)
                umr->flags |= MLX5_UMR_INLINE;
}

static u8 get_umr_flags(int acc)
{
        return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
               (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
               (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
               (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
                MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
}

static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
                             struct mlx5_ib_mr *mr,
                             u32 key, int access)
{
        int ndescs = ALIGN(mr->ndescs, 8) >> 1;

        memset(seg, 0, sizeof(*seg));

        if (mr->access_mode == MLX5_ACCESS_MODE_MTT)
                seg->log2_page_size = ilog2(mr->ibmr.page_size);
        else if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
                /* KLMs take twice the size of MTTs */
                ndescs *= 2;

        seg->flags = get_umr_flags(access) | mr->access_mode;
        seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00);
        seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
        seg->start_addr = cpu_to_be64(mr->ibmr.iova);
        seg->len = cpu_to_be64(mr->ibmr.length);
        seg->xlt_oct_size = cpu_to_be32(ndescs);
}

static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg)
{
        memset(seg, 0, sizeof(*seg));
        seg->status = MLX5_MKEY_STATUS_FREE;
}

static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, const struct ib_send_wr *wr)
{
        const struct mlx5_umr_wr *umrwr = umr_wr(wr);

        memset(seg, 0, sizeof(*seg));
        if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
                seg->status = MLX5_MKEY_STATUS_FREE;
                return;
        }

        seg->flags = convert_access(umrwr->access_flags);
        if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
                if (umrwr->pd)
                        seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
                seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
        }
        seg->len = cpu_to_be64(umrwr->length);
        seg->log2_page_size = umrwr->page_shift;
        seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
                                       mlx5_mkey_variant(umrwr->mkey));
}

static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
                             struct mlx5_ib_mr *mr,
                             struct mlx5_ib_pd *pd)
{
        int bcount = mr->desc_size * mr->ndescs;

        dseg->addr = cpu_to_be64(mr->desc_map);
        dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
        dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey);
}

static __be32 send_ieth(const struct ib_send_wr *wr)
{
        switch (wr->opcode) {
        case IB_WR_SEND_WITH_IMM:
        case IB_WR_RDMA_WRITE_WITH_IMM:
                return wr->ex.imm_data;

        case IB_WR_SEND_WITH_INV:
                return cpu_to_be32(wr->ex.invalidate_rkey);

        default:
                return 0;
        }
}

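/* XOR checksum over @size bytes of @wqe, returned complemented. */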
static u8 calc_sig(void *wqe, int size)
{
        u8 *p = wqe;
        u8 res = 0;
        int i;

        for (i = 0; i < size; i++)
                res ^= p[i];

        return ~res;
}

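/*
 * The low six bits of byte 8 of the WQE give its size in 16-byte
 * units; checksum that many bytes.
 */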
static u8 wq_sig(void *wqe)
{
        return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
}

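/*
 * Copy the scatter list inline into the WQE, wrapping back to the
 * start of the send queue buffer when the copy crosses qp->sq.qend.
 * Fails with -ENOMEM once the total exceeds the QP's max_inline_data.
 */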
static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
                            void *wqe, int *sz)
{
        struct mlx5_wqe_inline_seg *seg;
        void *qend = qp->sq.qend;
        void *addr;
        int inl = 0;
        int copy;
        int len;
        int i;

        seg = wqe;
        wqe += sizeof(*seg);
        for (i = 0; i < wr->num_sge; i++) {
                addr = (void *)(unsigned long)(wr->sg_list[i].addr);
                len  = wr->sg_list[i].length;
                inl += len;

                if (unlikely(inl > qp->max_inline_data))
                        return -ENOMEM;

                if (unlikely(wqe + len > qend)) {
                        copy = qend - wqe;
                        memcpy(wqe, addr, copy);
                        addr += copy;
                        len -= copy;
                        wqe = mlx5_get_send_wqe(qp, 0);
                }
                memcpy(wqe, addr, len);
                wqe += len;
        }

        seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);

        *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;

        return 0;
}

static u16 prot_field_size(enum ib_signature_type type)
{
        switch (type) {
        case IB_SIG_TYPE_T10_DIF:
                return MLX5_DIF_SIZE;
        default:
                return 0;
        }
}

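/*
 * Map a logical block size in bytes to the device's block-size
 * selector encoding; unsupported sizes map to 0. The last entry,
 * 1073741824, is a 1 GB block.
 */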
static u8 bs_selector(int block_size)
{
        switch (block_size) {
        case 512:           return 0x1;
        case 520:           return 0x2;
        case 4096:          return 0x3;
        case 4160:          return 0x4;
        case 1073741824:    return 0x5;
        default:            return 0;
        }
}

static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain,
                              struct mlx5_bsf_inl *inl)
{
        /* Valid inline section and allow BSF refresh */
        inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID |
                                       MLX5_BSF_REFRESH_DIF);
        inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag);
        inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag);
        /* repeating block */
        inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK;
        inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ?
                        MLX5_DIF_CRC : MLX5_DIF_IPCS;

        if (domain->sig.dif.ref_remap)
                inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG;

        if (domain->sig.dif.app_escape) {
                if (domain->sig.dif.ref_escape)
                        inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE;
                else
                        inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE;
        }

        inl->dif_app_bitmask_check =
                cpu_to_be16(domain->sig.dif.apptag_check_mask);
}

static int mlx5_set_bsf(struct ib_mr *sig_mr,
                        struct ib_sig_attrs *sig_attrs,
                        struct mlx5_bsf *bsf, u32 data_size)
{
        struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig;
        struct mlx5_bsf_basic *basic = &bsf->basic;
        struct ib_sig_domain *mem = &sig_attrs->mem;
        struct ib_sig_domain *wire = &sig_attrs->wire;

        memset(bsf, 0, sizeof(*bsf));

        /* Basic + Extended + Inline */
        basic->bsf_size_sbs = 1 << 7;
        /* Input domain check byte mask */
        basic->check_byte_mask = sig_attrs->check_mask;
        basic->raw_data_size = cpu_to_be32(data_size);

        /* Memory domain */
        switch (sig_attrs->mem.sig_type) {
        case IB_SIG_TYPE_NONE:
                break;
        case IB_SIG_TYPE_T10_DIF:
                basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval);
                basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx);
                mlx5_fill_inl_bsf(mem, &bsf->m_inl);
                break;
        default:
                return -EINVAL;
        }

        /* Wire domain */
        switch (sig_attrs->wire.sig_type) {
        case IB_SIG_TYPE_NONE:
                break;
        case IB_SIG_TYPE_T10_DIF:
                if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval &&
                    mem->sig_type == wire->sig_type) {
                        /* Same block structure */
                        basic->bsf_size_sbs |= 1 << 4;
                        if (mem->sig.dif.bg_type == wire->sig.dif.bg_type)
                                basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK;
                        if (mem->sig.dif.app_tag == wire->sig.dif.app_tag)
                                basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK;
                        if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag)
                                basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK;
                } else
                        basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval);

                basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx);
                mlx5_fill_inl_bsf(wire, &bsf->w_inl);
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

static int set_sig_data_segment(const struct ib_sig_handover_wr *wr,
                                struct mlx5_ib_qp *qp, void **seg, int *size)
{
        struct ib_sig_attrs *sig_attrs = wr->sig_attrs;
        struct ib_mr *sig_mr = wr->sig_mr;
        struct mlx5_bsf *bsf;
        u32 data_len = wr->wr.sg_list->length;
        u32 data_key = wr->wr.sg_list->lkey;
        u64 data_va = wr->wr.sg_list->addr;
        int ret;
        int wqe_size;

        if (!wr->prot ||
            (data_key == wr->prot->lkey &&
             data_va == wr->prot->addr &&
             data_len == wr->prot->length)) {
                /*
                 * The source domain doesn't contain signature information,
                 * or data and protection are interleaved in memory, so a
                 * single KLM followed by the BSF is enough:
                 *                  ------------------
                 *                 |     data_klm     |
                 *                  ------------------
                 *                 |       BSF        |
                 *                  ------------------
                 */
                struct mlx5_klm *data_klm = *seg;

                data_klm->bcount = cpu_to_be32(data_len);
                data_klm->key = cpu_to_be32(data_key);
                data_klm->va = cpu_to_be64(data_va);
                wqe_size = ALIGN(sizeof(*data_klm), 64);
        } else {
                /*
                 * The source domain contains signature information, so we
                 * need to construct a strided block format:
                 *               ---------------------------
                 *              |     stride_block_ctrl     |
                 *               ---------------------------
                 *              |          data_klm         |
                 *               ---------------------------
                 *              |          prot_klm         |
                 *               ---------------------------
                 *              |             BSF           |
                 *               ---------------------------
                 */
                struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
                struct mlx5_stride_block_entry *data_sentry;
                struct mlx5_stride_block_entry *prot_sentry;
                u32 prot_key = wr->prot->lkey;
                u64 prot_va = wr->prot->addr;
                u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
                int prot_size;

                sblock_ctrl = *seg;
                data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl);
                prot_sentry = (void *)data_sentry + sizeof(*data_sentry);

                prot_size = prot_field_size(sig_attrs->mem.sig_type);
                if (!prot_size) {
                        pr_err("Bad block size given: %u\n", block_size);
                        return -EINVAL;
                }
                sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size +
                                                            prot_size);
                sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP);
                sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size);
                sblock_ctrl->num_entries = cpu_to_be16(2);

                data_sentry->bcount = cpu_to_be16(block_size);
                data_sentry->key = cpu_to_be32(data_key);
                data_sentry->va = cpu_to_be64(data_va);
                data_sentry->stride = cpu_to_be16(block_size);

                prot_sentry->bcount = cpu_to_be16(prot_size);
                prot_sentry->key = cpu_to_be32(prot_key);
                prot_sentry->va = cpu_to_be64(prot_va);
                prot_sentry->stride = cpu_to_be16(prot_size);

                wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) +
                                 sizeof(*prot_sentry), 64);
        }

        *seg += wqe_size;
        *size += wqe_size / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        bsf = *seg;
        ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
        if (ret)
                return -EINVAL;

        *seg += sizeof(*bsf);
        *size += sizeof(*bsf) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        return 0;
}

static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
                                 const struct ib_sig_handover_wr *wr, u32 nelements,
                                 u32 length, u32 pdn)
{
        struct ib_mr *sig_mr = wr->sig_mr;
        u32 sig_key = sig_mr->rkey;
        u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;

        memset(seg, 0, sizeof(*seg));

        seg->flags = get_umr_flags(wr->access_flags) |
                                   MLX5_ACCESS_MODE_KLM;
        seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
        seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
                                    MLX5_MKEY_BSF_EN | pdn);
        seg->len = cpu_to_be64(length);
        seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements)));
        seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
}

static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
                                u32 nelements)
{
        memset(umr, 0, sizeof(*umr));

        umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
        umr->klm_octowords = get_klm_octo(nelements);
        umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
        umr->mkey_mask = sig_mkey_mask();
}


static int set_sig_umr_wr(const struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp,
                          void **seg, int *size)
{
        const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr);
        struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr);
        u32 pdn = get_pd(qp)->pdn;
        u32 klm_oct_size;
        int region_len, ret;

        if (unlikely(wr->wr.num_sge != 1) ||
            unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) ||
            unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
            unlikely(!sig_mr->sig->sig_status_checked))
                return -EINVAL;

        /* length of the protected region, data + protection */
        region_len = wr->wr.sg_list->length;
        if (wr->prot &&
            (wr->prot->lkey != wr->wr.sg_list->lkey  ||
             wr->prot->addr != wr->wr.sg_list->addr  ||
             wr->prot->length != wr->wr.sg_list->length))
                region_len += wr->prot->length;

        /*
         * KLM octoword size: if protection was provided we use the strided
         * block format (3 octowords), otherwise a single KLM (1 octoword).
         */
        klm_oct_size = wr->prot ? 3 : 1;

        set_sig_umr_segment(*seg, klm_oct_size);
        *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
        *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn);
        *seg += sizeof(struct mlx5_mkey_seg);
        *size += sizeof(struct mlx5_mkey_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        ret = set_sig_data_segment(wr, qp, seg, size);
        if (ret)
                return ret;

        sig_mr->sig->sig_status_checked = false;
        return 0;
}

static int set_psv_wr(struct ib_sig_domain *domain,
                      u32 psv_idx, void **seg, int *size)
{
        struct mlx5_seg_set_psv *psv_seg = *seg;

        memset(psv_seg, 0, sizeof(*psv_seg));
        psv_seg->psv_num = cpu_to_be32(psv_idx);
        switch (domain->sig_type) {
        case IB_SIG_TYPE_NONE:
                break;
        case IB_SIG_TYPE_T10_DIF:
                psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 |
                                                     domain->sig.dif.app_tag);
                psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
                break;
        default:
                pr_err("Bad signature type given.\n");
                return 1;
        }

        *seg += sizeof(*psv_seg);
        *size += sizeof(*psv_seg) / 16;

        return 0;
}

static int set_reg_wr(struct mlx5_ib_qp *qp,
                      const struct ib_reg_wr *wr,
                      void **seg, int *size)
{
        struct mlx5_ib_mr *mr = to_mmr(wr->mr);
        struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);

        if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
                mlx5_ib_warn(to_mdev(qp->ibqp.device),
                             "Invalid IB_SEND_INLINE send flag\n");
                return -EINVAL;
        }

        set_reg_umr_seg(*seg, mr);
        *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
        *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        set_reg_mkey_seg(*seg, mr, wr->key, wr->access);
        *seg += sizeof(struct mlx5_mkey_seg);
        *size += sizeof(struct mlx5_mkey_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);

        set_reg_data_seg(*seg, mr, pd);
        *seg += sizeof(struct mlx5_wqe_data_seg);
        *size += (sizeof(struct mlx5_wqe_data_seg) / 16);

        return 0;
}

static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size)
{
        set_linv_umr_seg(*seg);
        *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
        *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);
        set_linv_mkey_seg(*seg);
        *seg += sizeof(struct mlx5_mkey_seg);
        *size += sizeof(struct mlx5_mkey_seg) / 16;
        if (unlikely((*seg == qp->sq.qend)))
                *seg = mlx5_get_send_wqe(qp, 0);
}

static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
{
        __be32 *p = NULL;
        int tidx = idx;
        int i, j;

        pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
        for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
                if ((i & 0xf) == 0) {
                        void *buf = mlx5_get_send_wqe(qp, tidx);
                        tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
                        p = buf;
                        j = 0;
                }
                pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
                         be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
                         be32_to_cpu(p[j + 3]));
        }
}

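/*
 * Choose the fence mode for a WR from the explicit IB_SEND_FENCE flag
 * and the fence state cached from previously posted WRs (qp->fm_cache,
 * passed in as @fence).
 */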
static u8 get_fence(u8 fence, const struct ib_send_wr *wr)
{
        if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
                     wr->send_flags & IB_SEND_FENCE))
                return MLX5_FENCE_MODE_STRONG_ORDERING;

        if (unlikely(fence)) {
                if (wr->send_flags & IB_SEND_FENCE)
                        return MLX5_FENCE_MODE_SMALL_AND_FENCE;
                else
                        return fence;
        } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) {
                return MLX5_FENCE_MODE_FENCE;
        }

        return 0;
}

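/*
 * Reserve the next slot in the send queue and initialize its control
 * segment. Returns -ENOMEM if posting nreq more WQEs would overflow
 * the queue.
 */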
static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
                     struct mlx5_wqe_ctrl_seg **ctrl,
                     const struct ib_send_wr *wr, unsigned *idx,
                     int *size, int nreq, int send_flags)
{
        if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
                return -ENOMEM;

        *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
        *seg = mlx5_get_send_wqe(qp, *idx);
        *ctrl = *seg;
        *(uint32_t *)(*seg + 8) = 0;
        (*ctrl)->imm = send_ieth(wr);
        (*ctrl)->fm_ce_se = qp->sq_signal_bits |
                (send_flags & IB_SEND_SIGNALED ?
                 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
                (send_flags & IB_SEND_SOLICITED ?
                 MLX5_WQE_CTRL_SOLICITED : 0);

        *seg += sizeof(**ctrl);
        *size = sizeof(**ctrl) / 16;

        return 0;
}

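/*
 * Fill in the remaining control segment fields and the software
 * book-keeping for a fully built WQE. @size is in 16-byte units;
 * cur_post advances in 64-byte basic blocks (MLX5_SEND_WQE_BB), so a
 * large WQE may consume several posting slots.
 */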
static void finish_wqe(struct mlx5_ib_qp *qp,
                       struct mlx5_wqe_ctrl_seg *ctrl,
                       u8 size, unsigned idx, u64 wr_id,
                       int nreq, u8 fence, u8 next_fence,
                       u32 mlx5_opcode)
{
        u8 opmod = 0;

        ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
                                             mlx5_opcode | ((u32)opmod << 24));
        ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8));
        ctrl->fm_ce_se |= fence;
        qp->fm_cache = next_fence;
        if (unlikely(qp->wq_sig))
                ctrl->signature = wq_sig(ctrl);

        qp->sq.wrid[idx] = wr_id;
        qp->sq.w_list[idx].opcode = mlx5_opcode;
        qp->sq.wqe_head[idx] = qp->sq.head + nreq;
        qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
        qp->sq.w_list[idx].next = qp->sq.cur_post;
}


int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                      const struct ib_send_wr **bad_wr)
{
        struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* initialized to silence a compiler warning */
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_core_dev *mdev = dev->mdev;
        struct mlx5_ib_qp *qp;
        struct mlx5_ib_mr *mr;
        struct mlx5_wqe_data_seg *dpseg;
        struct mlx5_wqe_xrc_seg *xrc;
        struct mlx5_bf *bf;
        int uninitialized_var(size);
        void *qend;
        unsigned long flags;
        unsigned idx;
        int err = 0;
        int num_sge;
        void *seg;
        int nreq;
        int i;
        u8 next_fence = 0;
        u8 fence;

        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);

        qp = to_mqp(ibqp);
        bf = &qp->bf;
        qend = qp->sq.qend;

        spin_lock_irqsave(&qp->sq.lock, flags);

        if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
                err = -EIO;
                *bad_wr = wr;
                nreq = 0;
                goto out;
        }

        for (nreq = 0; wr; nreq++, wr = wr->next) {
                if (unlikely(wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
                        mlx5_ib_warn(dev, "\n");
                        err = -EINVAL;
                        *bad_wr = wr;
                        goto out;
                }

                fence = qp->fm_cache;
                num_sge = wr->num_sge;
                if (unlikely(num_sge > qp->sq.max_gs)) {
                        mlx5_ib_warn(dev, "\n");
                        err = -EINVAL;
                        *bad_wr = wr;
                        goto out;
                }

                err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq, wr->send_flags);
                if (err) {
                        mlx5_ib_warn(dev, "\n");
                        err = -ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                switch (ibqp->qp_type) {
                case IB_QPT_XRC_INI:
                        xrc = seg;
                        seg += sizeof(*xrc);
                        size += sizeof(*xrc) / 16;
                        /* fall through */
                case IB_QPT_RC:
                        switch (wr->opcode) {
                        case IB_WR_RDMA_READ:
                        case IB_WR_RDMA_WRITE:
                        case IB_WR_RDMA_WRITE_WITH_IMM:
                                set_raddr_seg(seg, rdma_wr(wr)->remote_addr,
                                              rdma_wr(wr)->rkey);
                                seg += sizeof(struct mlx5_wqe_raddr_seg);
                                size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
                                break;

                        case IB_WR_ATOMIC_CMP_AND_SWP:
                        case IB_WR_ATOMIC_FETCH_AND_ADD:
                        case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
                                mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
                                err = -ENOSYS;
                                *bad_wr = wr;
                                goto out;

                        case IB_WR_LOCAL_INV:
                                next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
                                qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
                                ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
                                set_linv_wr(qp, &seg, &size);
                                num_sge = 0;
                                break;

                        case IB_WR_REG_MR:
                                next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
                                qp->sq.wr_data[idx] = IB_WR_REG_MR;
                                ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
                                err = set_reg_wr(qp, reg_wr(wr), &seg, &size);
                                if (err) {
                                        *bad_wr = wr;
                                        goto out;
                                }
                                num_sge = 0;
                                break;

                        case IB_WR_REG_SIG_MR:
                                qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
                                mr = to_mmr(sig_handover_wr(wr)->sig_mr);

                                ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
                                err = set_sig_umr_wr(wr, qp, &seg, &size);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }

                                finish_wqe(qp, ctrl, size, idx, wr->wr_id,
                                           nreq, get_fence(fence, wr),
                                           next_fence, MLX5_OPCODE_UMR);
                                /*
                                 * SET_PSV WQEs are not signaled and solicited
                                 * on error
                                 */
                                err = begin_wqe(qp, &seg, &ctrl, wr,
                                                &idx, &size, nreq, IB_SEND_SOLICITED);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        err = -ENOMEM;
                                        *bad_wr = wr;
                                        goto out;
                                }

                                err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem,
                                                 mr->sig->psv_memory.psv_idx, &seg,
                                                 &size);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }

                                finish_wqe(qp, ctrl, size, idx, wr->wr_id,
                                           nreq, get_fence(fence, wr),
                                           next_fence, MLX5_OPCODE_SET_PSV);
                                err = begin_wqe(qp, &seg, &ctrl, wr,
                                                &idx, &size, nreq, wr->send_flags);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        err = -ENOMEM;
                                        *bad_wr = wr;
                                        goto out;
                                }

                                next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
                                err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire,
                                                 mr->sig->psv_wire.psv_idx, &seg,
                                                 &size);
                                if (err) {
                                        mlx5_ib_warn(dev, "\n");
                                        *bad_wr = wr;
                                        goto out;
                                }

                                finish_wqe(qp, ctrl, size, idx, wr->wr_id,
                                           nreq, get_fence(fence, wr),
                                           next_fence, MLX5_OPCODE_SET_PSV);
                                num_sge = 0;
                                goto skip_psv;

                        default:
                                break;
                        }
                        break;

                case IB_QPT_UC:
                        switch (wr->opcode) {
                        case IB_WR_RDMA_WRITE:
                        case IB_WR_RDMA_WRITE_WITH_IMM:
                                set_raddr_seg(seg, rdma_wr(wr)->remote_addr,
                                              rdma_wr(wr)->rkey);
                                seg  += sizeof(struct mlx5_wqe_raddr_seg);
                                size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
                                break;

                        default:
                                break;
                        }
                        break;

                case IB_QPT_SMI:
                case MLX5_IB_QPT_HW_GSI:
                        set_datagram_seg(seg, wr);
                        seg += sizeof(struct mlx5_wqe_datagram_seg);
                        size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
                        if (unlikely((seg == qend)))
                                seg = mlx5_get_send_wqe(qp, 0);
                        break;
                case IB_QPT_UD:
                        set_datagram_seg(seg, wr);
                        seg += sizeof(struct mlx5_wqe_datagram_seg);
                        size += sizeof(struct mlx5_wqe_datagram_seg) / 16;

                        if (unlikely((seg == qend)))
                                seg = mlx5_get_send_wqe(qp, 0);

                        /* handle qp that supports ud offload */
                        if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) {
                                struct mlx5_wqe_eth_pad *pad;

                                pad = seg;
                                memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad));
                                seg += sizeof(struct mlx5_wqe_eth_pad);
                                size += sizeof(struct mlx5_wqe_eth_pad) / 16;

                                seg = set_eth_seg(seg, wr, qend, qp, &size);

                                if (unlikely((seg == qend)))
                                        seg = mlx5_get_send_wqe(qp, 0);
                        }
                        break;
                case MLX5_IB_QPT_REG_UMR:
                        if (wr->opcode != MLX5_IB_WR_UMR) {
                                err = -EINVAL;
                                mlx5_ib_warn(dev, "bad opcode\n");
                                goto out;
                        }
                        qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
                        ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey);
                        set_reg_umr_segment(seg, wr);
                        seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
                        size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
                        if (unlikely((seg == qend)))
                                seg = mlx5_get_send_wqe(qp, 0);
                        set_reg_mkey_segment(seg, wr);
                        seg += sizeof(struct mlx5_mkey_seg);
                        size += sizeof(struct mlx5_mkey_seg) / 16;
                        if (unlikely((seg == qend)))
                                seg = mlx5_get_send_wqe(qp, 0);
                        break;

                default:
                        break;
                }

                if ((wr->send_flags & IB_SEND_INLINE) && num_sge) {
                        int uninitialized_var(sz);

                        err = set_data_inl_seg(qp, wr, seg, &sz);
                        if (unlikely(err)) {
                                mlx5_ib_warn(dev, "\n");
                                *bad_wr = wr;
                                goto out;
                        }
                        size += sz;
                } else {
                        dpseg = seg;
                        for (i = 0; i < num_sge; i++) {
                                if (unlikely(dpseg == qend)) {
                                        seg = mlx5_get_send_wqe(qp, 0);
                                        dpseg = seg;
                                }
                                if (likely(wr->sg_list[i].length)) {
                                        set_data_ptr_seg(dpseg, wr->sg_list + i);
                                        size += sizeof(struct mlx5_wqe_data_seg) / 16;
                                        dpseg++;
                                }
                        }
                }

                finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
                           get_fence(fence, wr), next_fence,
                           mlx5_ib_opcode[wr->opcode]);
skip_psv:
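		/* Flip the constant to 1 to dump each posted WQE when debugging. */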
                if (0)
                        dump_wqe(qp, idx, size);
        }

out:
        if (likely(nreq)) {
                qp->sq.head += nreq;

                /* Make sure that descriptors are written before
                 * updating doorbell record and ringing the doorbell
                 */
                wmb();

                qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);

                /* Make sure the doorbell record is visible to the HCA
                 * before we ring the doorbell.
                 */
                wmb();

                mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset,
                             MLX5_GET_DOORBELL_LOCK(&bf->lock32));
                /* Make sure doorbells don't leak out of SQ spinlock
                 * and reach the HCA out of order.
                 */
                bf->offset ^= bf->buf_size;
        }

        spin_unlock_irqrestore(&qp->sq.lock, flags);

        return err;
}

static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
{
        sig->signature = calc_sig(sig, size);
}

int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
                      const struct ib_recv_wr **bad_wr)
{
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
        struct mlx5_wqe_data_seg *scat;
        struct mlx5_rwqe_sig *sig;
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_core_dev *mdev = dev->mdev;
        unsigned long flags;
        int err = 0;
        int nreq;
        int ind;
        int i;

        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr);

        spin_lock_irqsave(&qp->rq.lock, flags);

        if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
                err = -EIO;
                *bad_wr = wr;
                nreq = 0;
                goto out;
        }

        ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

        for (nreq = 0; wr; nreq++, wr = wr->next) {
                if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
                        err = -ENOMEM;
                        *bad_wr = wr;
                        goto out;
                }

                if (unlikely(wr->num_sge > qp->rq.max_gs)) {
                        err = -EINVAL;
                        *bad_wr = wr;
                        goto out;
                }

                scat = get_recv_wqe(qp, ind);
                if (qp->wq_sig)
                        scat++;

                for (i = 0; i < wr->num_sge; i++)
                        set_data_ptr_seg(scat + i, wr->sg_list + i);

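                /* Terminate a short scatter list with a zero-length, invalid-lkey entry. */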
                if (i < qp->rq.max_gs) {
                        scat[i].byte_count = 0;
                        scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
                        scat[i].addr       = 0;
                }

                if (qp->wq_sig) {
                        sig = (struct mlx5_rwqe_sig *)scat;
                        set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
                }

                qp->rq.wrid[ind] = wr->wr_id;

                ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
        }

out:
        if (likely(nreq)) {
                qp->rq.head += nreq;

                /* Make sure that descriptors are written before
                 * doorbell record.
                 */
                wmb();

                *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
        }

        spin_unlock_irqrestore(&qp->rq.lock, flags);

        return err;
}

static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
{
        switch (mlx5_state) {
        case MLX5_QP_STATE_RST:      return IB_QPS_RESET;
        case MLX5_QP_STATE_INIT:     return IB_QPS_INIT;
        case MLX5_QP_STATE_RTR:      return IB_QPS_RTR;
        case MLX5_QP_STATE_RTS:      return IB_QPS_RTS;
        case MLX5_QP_STATE_SQ_DRAINING:
        case MLX5_QP_STATE_SQD:      return IB_QPS_SQD;
        case MLX5_QP_STATE_SQER:     return IB_QPS_SQE;
        case MLX5_QP_STATE_ERR:      return IB_QPS_ERR;
        default:                     return -1;
        }
}

static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state)
{
        switch (mlx5_mig_state) {
        case MLX5_QP_PM_ARMED:          return IB_MIG_ARMED;
        case MLX5_QP_PM_REARM:          return IB_MIG_REARM;
        case MLX5_QP_PM_MIGRATED:       return IB_MIG_MIGRATED;
        default: return -1;
        }
}

static int to_ib_qp_access_flags(int mlx5_flags)
{
        int ib_flags = 0;

        if (mlx5_flags & MLX5_QP_BIT_RRE)
                ib_flags |= IB_ACCESS_REMOTE_READ;
        if (mlx5_flags & MLX5_QP_BIT_RWE)
                ib_flags |= IB_ACCESS_REMOTE_WRITE;
        if (mlx5_flags & MLX5_QP_BIT_RAE)
                ib_flags |= IB_ACCESS_REMOTE_ATOMIC;

        return ib_flags;
}

static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
                                struct mlx5_qp_path *path)
{
        struct mlx5_core_dev *dev = ibdev->mdev;

        memset(ib_ah_attr, 0, sizeof(*ib_ah_attr));
        ib_ah_attr->port_num      = path->port;

        if (ib_ah_attr->port_num == 0 ||
            ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports))
                return;

        ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf;

        ib_ah_attr->dlid          = be16_to_cpu(path->rlid);
        ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
        ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
        ib_ah_attr->ah_flags      = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0;
        if (ib_ah_attr->ah_flags) {
                ib_ah_attr->grh.sgid_index = path->mgid_index;
                ib_ah_attr->grh.hop_limit  = path->hop_limit;
                ib_ah_attr->grh.traffic_class =
                        (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
                ib_ah_attr->grh.flow_label =
                        be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
                memcpy(ib_ah_attr->grh.dgid.raw,
                       path->rgid, sizeof(ib_ah_attr->grh.dgid.raw));
        }
}

static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev,
                                        struct mlx5_ib_sq *sq,
                                        u8 *sq_state)
{
        void *out;
        void *sqc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(query_sq_out);
        out = mlx5_vzalloc(inlen);
        if (!out)
                return -ENOMEM;

        err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out);
        if (err)
                goto out;

        sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context);
        *sq_state = MLX5_GET(sqc, sqc, state);
        sq->state = *sq_state;

out:
        kvfree(out);
        return err;
}

static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev,
                                        struct mlx5_ib_rq *rq,
                                        u8 *rq_state)
{
        void *out;
        void *rqc;
        int inlen;
        int err;

        inlen = MLX5_ST_SZ_BYTES(query_rq_out);
        out = mlx5_vzalloc(inlen);
        if (!out)
                return -ENOMEM;

        err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out);
        if (err)
                goto out;

        rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context);
        *rq_state = MLX5_GET(rqc, rqc, state);
        rq->state = *rq_state;

out:
        kvfree(out);
        return err;
}

static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state,
                                  struct mlx5_ib_qp *qp, u8 *qp_state)
{
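        /*
         * Combined SQ/RQ state table for Raw Packet QPs. MLX5_QP_STATE is a
         * sentinel meaning "keep the current software QP state", while
         * MLX5_QP_STATE_BAD marks an inconsistent hardware combination.
         */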
        static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = {
                [MLX5_RQC_STATE_RST] = {
                        [MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
                        [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE_BAD,
                        [MLX5_SQC_STATE_ERR]    = MLX5_QP_STATE_BAD,
                        [MLX5_SQ_STATE_NA]      = IB_QPS_RESET,
                },
                [MLX5_RQC_STATE_RDY] = {
                        [MLX5_SQC_STATE_RST]    = MLX5_QP_STATE_BAD,
                        [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE,
                        [MLX5_SQC_STATE_ERR]    = IB_QPS_SQE,
                        [MLX5_SQ_STATE_NA]      = MLX5_QP_STATE,
                },
                [MLX5_RQC_STATE_ERR] = {
                        [MLX5_SQC_STATE_RST]    = MLX5_QP_STATE_BAD,
                        [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE_BAD,
                        [MLX5_SQC_STATE_ERR]    = IB_QPS_ERR,
                        [MLX5_SQ_STATE_NA]      = IB_QPS_ERR,
                },
                [MLX5_RQ_STATE_NA] = {
                        [MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
                        [MLX5_SQC_STATE_RDY]    = MLX5_QP_STATE,
                        [MLX5_SQC_STATE_ERR]    = MLX5_QP_STATE,
                        [MLX5_SQ_STATE_NA]      = MLX5_QP_STATE_BAD,
                },
        };

        *qp_state = sqrq_trans[rq_state][sq_state];

        if (*qp_state == MLX5_QP_STATE_BAD) {
                WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x",
                     qp->raw_packet_qp.sq.base.mqp.qpn, sq_state,
                     qp->raw_packet_qp.rq.base.mqp.qpn, rq_state);
                return -EINVAL;
        }

        if (*qp_state == MLX5_QP_STATE)
                *qp_state = qp->state;

        return 0;
}

static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev,
                                     struct mlx5_ib_qp *qp,
                                     u8 *raw_packet_qp_state)
{
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
        int err;
        u8 sq_state = MLX5_SQ_STATE_NA;
        u8 rq_state = MLX5_RQ_STATE_NA;

        if (qp->sq.wqe_cnt) {
                err = query_raw_packet_qp_sq_state(dev, sq, &sq_state);
                if (err)
                        return err;
        }

        if (qp->rq.wqe_cnt) {
                err = query_raw_packet_qp_rq_state(dev, rq, &rq_state);
                if (err)
                        return err;
        }

        return sqrq_state_to_qp_state(sq_state, rq_state, qp,
                                      raw_packet_qp_state);
}

static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                         struct ib_qp_attr *qp_attr)
{
        int outlen = MLX5_ST_SZ_BYTES(query_qp_out);
        struct mlx5_qp_context *context;
        int mlx5_state;
        u32 *outb;
        int err = 0;

        outb = kzalloc(outlen, GFP_KERNEL);
        if (!outb)
                return -ENOMEM;

        err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb,
                                 outlen);
        if (err)
                goto out;

        /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */
        context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc);

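        /* The hardware QP state is kept in the top four bits of the flags word. */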
        mlx5_state = be32_to_cpu(context->flags) >> 28;

        qp->state                    = to_ib_qp_state(mlx5_state);
        qp_attr->path_mtu            = context->mtu_msgmax >> 5;
        qp_attr->path_mig_state      =
                to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
        qp_attr->qkey                = be32_to_cpu(context->qkey);
        qp_attr->rq_psn              = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
        qp_attr->sq_psn              = be32_to_cpu(context->next_send_psn) & 0xffffff;
        qp_attr->dest_qp_num         = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
        qp_attr->qp_access_flags     =
                to_ib_qp_access_flags(be32_to_cpu(context->params2));

        if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
                to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
                to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
                qp_attr->alt_pkey_index =
                        be16_to_cpu(context->alt_path.pkey_index);
                qp_attr->alt_port_num   = qp_attr->alt_ah_attr.port_num;
        }

        qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index);
        qp_attr->port_num = context->pri_path.port;

        /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
        qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;

        qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);

        qp_attr->max_dest_rd_atomic =
                1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
        qp_attr->min_rnr_timer      =
                (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
        qp_attr->timeout            = context->pri_path.ackto_lt >> 3;
        qp_attr->retry_cnt          = (be32_to_cpu(context->params1) >> 16) & 0x7;
        qp_attr->rnr_retry          = (be32_to_cpu(context->params1) >> 13) & 0x7;
        qp_attr->alt_timeout        = context->alt_path.ackto_lt >> 3;

out:
        kfree(outb);
        return err;
}

int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                     int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
{
        struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
        struct mlx5_ib_qp *qp = to_mqp(ibqp);
        int err = 0;
        u8 raw_packet_qp_state;

        if (ibqp->rwq_ind_tbl)
                return -ENOSYS;

        if (unlikely(ibqp->qp_type == IB_QPT_GSI))
                return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask,
                                            qp_init_attr);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        /*
         * Wait for any outstanding page faults, in case the user frees memory
         * based upon this query's result.
         */
        flush_workqueue(mlx5_ib_page_fault_wq);
#endif

        mutex_lock(&qp->mutex);

        if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
                err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
                if (err)
                        goto out;
                qp->state = raw_packet_qp_state;
                qp_attr->port_num = 1;
        } else {
                err = query_qp_attr(dev, qp, qp_attr);
                if (err)
                        goto out;
        }

        qp_attr->qp_state            = qp->state;
        qp_attr->cur_qp_state        = qp_attr->qp_state;
        qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
        qp_attr->cap.max_recv_sge    = qp->rq.max_gs;

        if (!ibqp->uobject) {
                qp_attr->cap.max_send_wr  = qp->sq.max_post;
                qp_attr->cap.max_send_sge = qp->sq.max_gs;
                qp_init_attr->qp_context = ibqp->qp_context;
        } else {
                qp_attr->cap.max_send_wr  = 0;
                qp_attr->cap.max_send_sge = 0;
        }

        qp_init_attr->qp_type = ibqp->qp_type;
        qp_init_attr->recv_cq = ibqp->recv_cq;
        qp_init_attr->send_cq = ibqp->send_cq;
        qp_init_attr->srq = ibqp->srq;
        qp_attr->cap.max_inline_data = qp->max_inline_data;

        qp_init_attr->cap            = qp_attr->cap;

        qp_init_attr->create_flags = 0;
        if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
                qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;

        if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
                qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL;
        if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
                qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND;
        if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
                qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
        if (qp->flags & MLX5_IB_QP_SQPN_QP1)
                qp_init_attr->create_flags |= MLX5_IB_QP_CREATE_SQPN_QP1;

        qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
                IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;

out:
        mutex_unlock(&qp->mutex);
        return err;
}

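/* Allocate an XRC domain, backed by a firmware XRCD number. */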
struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
                                   struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_xrcd *xrcd;
        int err;

        if (!MLX5_CAP_GEN(dev->mdev, xrc))
                return ERR_PTR(-ENOSYS);

        xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
        if (!xrcd)
                return ERR_PTR(-ENOMEM);

        err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn);
        if (err) {
                kfree(xrcd);
                return ERR_PTR(err);
        }

        return &xrcd->ibxrcd;
}

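/*
 * Release an XRC domain.  A firmware failure is only logged; the
 * in-kernel object is freed regardless.
 */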
int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
        u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
        int err;

        err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
        if (err)
                mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);

        kfree(xrcd);
        return 0;
}

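/*
 * Dispatch asynchronous WQ events from the core layer to the consumer's
 * event handler, translating hardware event codes to ib_event types.
 */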
static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type)
{
        struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp);
        struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device);
        struct ib_event event;

        if (rwq->ibwq.event_handler) {
                event.device     = rwq->ibwq.device;
                event.element.wq = &rwq->ibwq;
                switch (type) {
                case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
                        event.event = IB_EVENT_WQ_FATAL;
                        break;
                default:
                        mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn);
                        return;
                }

                rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context);
        }
}

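/*
 * Build and execute the CREATE_RQ firmware command for a user WQ: fill
 * in the RQ context and its embedded WQ context (stride, size, PD,
 * doorbell record and page list) from the previously prepared rwq.
 */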
static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd,
                      struct ib_wq_init_attr *init_attr)
{
        struct mlx5_ib_dev *dev;
        __be64 *rq_pas0;
        void *in;
        void *rqc;
        void *wq;
        int inlen;
        int err;

        dev = to_mdev(pd->device);

        inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas;
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid);
        rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
        MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE);
        MLX5_SET(rqc, rqc, user_index, rwq->user_index);
        MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn);
        MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
        MLX5_SET(rqc, rqc, flush_in_error_en, 1);
        wq = MLX5_ADDR_OF(rqc, rqc, wq);
        MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
        MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
        MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
        MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
        MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
        MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset);
        MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size);
        MLX5_SET(wq, wq, wq_signature, rwq->wq_sig);
        MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma);
        rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
        mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
        err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp);
        kvfree(in);
        return err;
}

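/*
 * Validate the RQ size requested by userspace against device limits and
 * derive the log-based stride/size values the firmware expects.
 */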
static int set_user_rq_size(struct mlx5_ib_dev *dev,
                            struct ib_wq_init_attr *wq_init_attr,
                            struct mlx5_ib_create_wq *ucmd,
                            struct mlx5_ib_rwq *rwq)
{
        /* Sanity check RQ size before proceeding */
        if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz)))
                return -EINVAL;

        if (!ucmd->rq_wqe_count)
                return -EINVAL;

        rwq->wqe_count = ucmd->rq_wqe_count;
        rwq->wqe_shift = ucmd->rq_wqe_shift;
        rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift);
        rwq->log_rq_stride = rwq->wqe_shift;
        rwq->log_rq_size = ilog2(rwq->wqe_count);
        return 0;
}

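/*
 * Copy in and validate the user's create-WQ command, then size the RQ
 * and pin its user buffer via create_user_rq().
 */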
static int prepare_user_rq(struct ib_pd *pd,
                           struct ib_wq_init_attr *init_attr,
                           struct ib_udata *udata,
                           struct mlx5_ib_rwq *rwq)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_create_wq ucmd = {};
        int err;
        size_t required_cmd_sz;

        required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
        if (udata->inlen < required_cmd_sz) {
                mlx5_ib_dbg(dev, "invalid inlen\n");
                return -EINVAL;
        }

        if (udata->inlen > sizeof(ucmd) &&
            !ib_is_udata_cleared(udata, sizeof(ucmd),
                                 udata->inlen - sizeof(ucmd))) {
                mlx5_ib_dbg(dev, "inlen is not supported\n");
                return -EOPNOTSUPP;
        }

        if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
                mlx5_ib_dbg(dev, "copy failed\n");
                return -EFAULT;
        }

        if (ucmd.comp_mask) {
                mlx5_ib_dbg(dev, "invalid comp mask\n");
                return -EOPNOTSUPP;
        }

        if (ucmd.reserved) {
                mlx5_ib_dbg(dev, "invalid reserved\n");
                return -EOPNOTSUPP;
        }

        err = set_user_rq_size(dev, init_attr, &ucmd, rwq);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
        }

        err = create_user_rq(dev, pd, rwq, &ucmd);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                if (err)
                        return err;
        }

        rwq->user_index = ucmd.user_index;
        return 0;
}

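/*
 * Create a work queue.  Only IB_WQT_RQ is supported: the WQ is backed
 * by a user-memory RQ that starts out in the RESET state.
 */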
struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
                                struct ib_wq_init_attr *init_attr,
                                struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev;
        struct mlx5_ib_rwq *rwq;
        struct mlx5_ib_create_wq_resp resp = {};
        size_t min_resp_len;
        int err;

        if (!udata)
                return ERR_PTR(-ENOSYS);

        min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
        if (udata->outlen && udata->outlen < min_resp_len)
                return ERR_PTR(-EINVAL);

        dev = to_mdev(pd->device);
        switch (init_attr->wq_type) {
        case IB_WQT_RQ:
                rwq = kzalloc(sizeof(*rwq), GFP_KERNEL);
                if (!rwq)
                        return ERR_PTR(-ENOMEM);
                err = prepare_user_rq(pd, init_attr, udata, rwq);
                if (err)
                        goto err;
                err = create_rq(rwq, pd, init_attr);
                if (err)
                        goto err_user_rq;
                break;
        default:
                mlx5_ib_dbg(dev, "unsupported wq type %d\n",
                            init_attr->wq_type);
                return ERR_PTR(-EINVAL);
        }

        rwq->ibwq.wq_num = rwq->core_qp.qpn;
        rwq->ibwq.state = IB_WQS_RESET;
        if (udata->outlen) {
                resp.response_length = offsetof(typeof(resp), response_length) +
                                sizeof(resp.response_length);
                err = ib_copy_to_udata(udata, &resp, resp.response_length);
                if (err)
                        goto err_copy;
        }

        rwq->core_qp.event = mlx5_ib_wq_event;
        rwq->ibwq.event_handler = init_attr->event_handler;
        return &rwq->ibwq;

err_copy:
        mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
err_user_rq:
        destroy_user_rq(pd, rwq, udata);
err:
        kfree(rwq);
        return ERR_PTR(err);
}

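/* Tear down a WQ: destroy the firmware RQ, then release the user buffer. */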
void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(wq->device);
        struct mlx5_ib_rwq *rwq = to_mrwq(wq);

        mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
        destroy_user_rq(wq->pd, rwq, udata);
        kfree(rwq);
}

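/*
 * Create an RQ indirection table (RQT) for RSS.  The table holds
 * 2^log_ind_tbl_size entries, each naming one of the caller's WQs.
 */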
struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
                                                      struct ib_rwq_ind_table_init_attr *init_attr,
                                                      struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(device);
        struct mlx5_ib_rwq_ind_table *rwq_ind_tbl;
        int sz = 1 << init_attr->log_ind_tbl_size;
        struct mlx5_ib_create_rwq_ind_tbl_resp resp = {};
        size_t min_resp_len;
        int inlen;
        int err;
        int i;
        u32 *in;
        void *rqtc;

        if (udata->inlen > 0 &&
            !ib_is_udata_cleared(udata, 0,
                                 udata->inlen))
                return ERR_PTR(-EOPNOTSUPP);

        if (init_attr->log_ind_tbl_size >
            MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) {
                mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n",
                            init_attr->log_ind_tbl_size,
                            MLX5_CAP_GEN(dev->mdev, log_max_rqt_size));
                return ERR_PTR(-EINVAL);
        }

        min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
        if (udata->outlen && udata->outlen < min_resp_len)
                return ERR_PTR(-EINVAL);

        rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL);
        if (!rwq_ind_tbl)
                return ERR_PTR(-ENOMEM);

        inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
        in = mlx5_vzalloc(inlen);
        if (!in) {
                err = -ENOMEM;
                goto err;
        }

        rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);

        MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
        MLX5_SET(rqtc, rqtc, rqt_max_size, sz);

        for (i = 0; i < sz; i++)
                MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num);

        rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid;
        MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid);

        err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
        kvfree(in);

        if (err)
                goto err;

        rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn;
        if (udata->outlen) {
                resp.response_length = offsetof(typeof(resp), response_length) +
                                        sizeof(resp.response_length);
                err = ib_copy_to_udata(udata, &resp, resp.response_length);
                if (err)
                        goto err_copy;
        }

        return &rwq_ind_tbl->ib_rwq_ind_tbl;

err_copy:
        mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
err:
        kfree(rwq_ind_tbl);
        return ERR_PTR(err);
}

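/* Destroy an RQ indirection table and free its kernel object. */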
int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
{
        struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
        struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);

        mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);

        kfree(rwq_ind_tbl);
        return 0;
}

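/*
 * Modify a WQ's state through the MODIFY_RQ firmware command.  The
 * current state may come from the caller (IB_WQ_CUR_STATE) or from the
 * cached ibwq state; on success the cached state is updated.
 */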
int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
                      u32 wq_attr_mask, struct ib_udata *udata)
{
        struct mlx5_ib_dev *dev = to_mdev(wq->device);
        struct mlx5_ib_rwq *rwq = to_mrwq(wq);
        struct mlx5_ib_modify_wq ucmd = {};
        size_t required_cmd_sz;
        int curr_wq_state;
        int wq_state;
        int inlen;
        int err;
        void *rqc;
        void *in;

        required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
        if (udata->inlen < required_cmd_sz)
                return -EINVAL;

        if (udata->inlen > sizeof(ucmd) &&
            !ib_is_udata_cleared(udata, sizeof(ucmd),
                                 udata->inlen - sizeof(ucmd)))
                return -EOPNOTSUPP;

        if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
                return -EFAULT;

        if (ucmd.comp_mask || ucmd.reserved)
                return -EOPNOTSUPP;

        inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
        in = mlx5_vzalloc(inlen);
        if (!in)
                return -ENOMEM;

        rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);

        MLX5_SET(modify_rq_in, in, rqn, rwq->core_qp.qpn);
        curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ?
                wq_attr->curr_wq_state : wq->state;
        wq_state = (wq_attr_mask & IB_WQ_STATE) ?
                wq_attr->wq_state : curr_wq_state;
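        /*
         * IB_WQS_RESET and IB_WQS_RDY share their numeric values with
         * MLX5_RQC_STATE_RST and MLX5_RQC_STATE_RDY, so only the error
         * state needs an explicit translation to the hardware encoding.
         */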
        if (curr_wq_state == IB_WQS_ERR)
                curr_wq_state = MLX5_RQC_STATE_ERR;
        if (wq_state == IB_WQS_ERR)
                wq_state = MLX5_RQC_STATE_ERR;
        MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state);
        MLX5_SET(modify_rq_in, in, uid, to_mpd(wq->pd)->uid);
        MLX5_SET(rqc, rqc, state, wq_state);

        err = mlx5_core_modify_rq(dev->mdev, in, inlen);
        kvfree(in);
        if (!err)
                rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state;

        return err;
}