drivers/infiniband/hw/erdma/erdma_main.c
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
/*          Kai Shen <kaishen@linux.alibaba.com> */
/* Copyright (c) 2020-2022, Alibaba Group. */

#include <linux/module.h>
#include <net/addrconf.h>
#include <rdma/erdma-abi.h>

#include "erdma.h"
#include "erdma_cm.h"
#include "erdma_verbs.h"

MODULE_AUTHOR("Cheng Xu <chengyou@linux.alibaba.com>");
MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver");
MODULE_LICENSE("Dual BSD/GPL");

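/*
 * Netdev notifier callback: keep the RDMA device's MTU in sync when the
 * bound net_device changes its MTU. All other netdev events are ignored.
 */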
static int erdma_netdev_event(struct notifier_block *nb, unsigned long event,
                              void *arg)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(arg);
        struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb);

        if (dev->netdev == NULL || dev->netdev != netdev)
                goto done;

        switch (event) {
        case NETDEV_CHANGEMTU:
                if (dev->mtu != netdev->mtu) {
                        erdma_set_mtu(dev, netdev->mtu);
                        dev->mtu = netdev->mtu;
                }
                break;
        case NETDEV_REGISTER:
        case NETDEV_UNREGISTER:
        case NETDEV_CHANGEADDR:
        case NETDEV_GOING_DOWN:
        case NETDEV_CHANGE:
        default:
                break;
        }

done:
        return NOTIFY_OK;
}

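/*
 * Find the net_device paired with this adapter (matched by permanent MAC
 * address) and link it to the ib_device. Returns -EPROBE_DEFER when no
 * matching netdev has been registered yet, so that probing is retried later.
 */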
static int erdma_enum_and_get_netdev(struct erdma_dev *dev)
{
        struct net_device *netdev;
        int ret = -EPROBE_DEFER;

        /* Already bound to a net_device, so we skip. */
        if (dev->netdev)
                return 0;

        rtnl_lock();
        for_each_netdev(&init_net, netdev) {
                /*
                 * In erdma, the paired netdev and ibdev share the same MAC
                 * address, which the adapter reports through its PCIe BAR
                 * registers. Since erdma cannot obtain a reference to the
                 * paired netdev directly, traverse all netdevs in init_net
                 * and match on the permanent address.
                 */
                if (ether_addr_equal_unaligned(netdev->perm_addr,
                                               dev->attrs.peer_addr)) {
                        ret = ib_device_set_netdev(&dev->ibdev, netdev, 1);
                        if (ret) {
                                rtnl_unlock();
                                ibdev_warn(&dev->ibdev,
                                           "failed (%d) to link netdev", ret);
                                return ret;
                        }

                        dev->netdev = netdev;
                        break;
                }
        }

        rtnl_unlock();

        return ret;
}

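/*
 * Bind the paired netdev, derive the node GUID from its MAC address,
 * register the ib_device with the RDMA core and install the netdev
 * event notifier.
 */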
static int erdma_device_register(struct erdma_dev *dev)
{
        struct ib_device *ibdev = &dev->ibdev;
        int ret;

        ret = erdma_enum_and_get_netdev(dev);
        if (ret)
                return ret;

        dev->mtu = dev->netdev->mtu;
        addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr);

        ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev);
        if (ret) {
                dev_err(&dev->pdev->dev,
                        "ib_register_device failed: ret = %d\n", ret);
                return ret;
        }

        dev->netdev_nb.notifier_call = erdma_netdev_event;
        ret = register_netdevice_notifier(&dev->netdev_nb);
        if (ret) {
                ibdev_err(&dev->ibdev, "failed to register notifier.\n");
                ib_unregister_device(ibdev);
        }

        return ret;
}

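/* Handler for the common MSI-X vector: cmdq completions and AEQ events. */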
static irqreturn_t erdma_comm_irq_handler(int irq, void *data)
{
        struct erdma_dev *dev = data;

        erdma_cmdq_completion_handler(&dev->cmdq);
        erdma_aeq_event_handler(dev);

        return IRQ_HANDLED;
}

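/*
 * Allocate MSI-X vectors: ideally one per possible CPU plus one for the
 * common (cmdq/AEQ) interrupt, capped at ERDMA_NUM_MSIX_VEC. The number
 * actually granted is recorded in dev->attrs.irq_num.
 */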
static int erdma_request_vectors(struct erdma_dev *dev)
{
        int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC);
        int ret;

        ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX);
        if (ret < 0) {
                dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n",
                        ret);
                return ret;
        }
        dev->attrs.irq_num = ret;

        return 0;
}

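/*
 * Request the common interrupt and hint its affinity towards a CPU local
 * to the device's PCI bus.
 */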
static int erdma_comm_irq_init(struct erdma_dev *dev)
{
        snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s",
                 pci_name(dev->pdev));
        dev->comm_irq.msix_vector =
                pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ);

        cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)),
                        &dev->comm_irq.affinity_hint_mask);
        irq_set_affinity_hint(dev->comm_irq.msix_vector,
                              &dev->comm_irq.affinity_hint_mask);

        return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0,
                           dev->comm_irq.name, dev);
}

static void erdma_comm_irq_uninit(struct erdma_dev *dev)
{
        irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL);
        free_irq(dev->comm_irq.msix_vector, dev);
}

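/*
 * Read the transport protocol in use, create the DMA pools for hardware
 * response buffers and doorbell records, and configure the DMA mask and
 * maximum segment size.
 */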
static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev)
{
        int ret;

        dev->proto = erdma_reg_read32(dev, ERDMA_REGS_DEV_PROTO_REG);

        dev->resp_pool = dma_pool_create("erdma_resp_pool", &pdev->dev,
                                         ERDMA_HW_RESP_SIZE, ERDMA_HW_RESP_SIZE,
                                         0);
        if (!dev->resp_pool)
                return -ENOMEM;

        dev->db_pool = dma_pool_create("erdma_db_pool", &pdev->dev,
                                       ERDMA_DB_SIZE, ERDMA_DB_SIZE, 0);
        if (!dev->db_pool) {
                ret = -ENOMEM;
                goto destroy_resp_pool;
        }

        ret = dma_set_mask_and_coherent(&pdev->dev,
                                        DMA_BIT_MASK(ERDMA_PCI_WIDTH));
        if (ret)
                goto destroy_db_pool;

        dma_set_max_seg_size(&pdev->dev, UINT_MAX);

        return 0;

destroy_db_pool:
        dma_pool_destroy(dev->db_pool);

destroy_resp_pool:
        dma_pool_destroy(dev->resp_pool);

        return ret;
}

static void erdma_device_uninit(struct erdma_dev *dev)
{
        dma_pool_destroy(dev->db_pool);
        dma_pool_destroy(dev->resp_pool);
}

static void erdma_hw_reset(struct erdma_dev *dev)
{
        u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1);

        erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
}

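/*
 * Start device initialization and poll the status register until the
 * hardware reports init done, or give up after ERDMA_WAIT_DEV_DONE_CNT
 * attempts.
 */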
static int erdma_wait_hw_init_done(struct erdma_dev *dev)
{
        int i;

        erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG,
                          FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1));

        for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) {
                if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
                                           ERDMA_REG_DEV_ST_INIT_DONE_MASK))
                        break;

                msleep(ERDMA_REG_ACCESS_WAIT_MS);
        }

        if (i == ERDMA_WAIT_DEV_DONE_CNT) {
                dev_err(&dev->pdev->dev, "wait init done failed.\n");
                return -ETIMEDOUT;
        }

        return 0;
}

static const struct pci_device_id erdma_pci_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) },
        {}
};

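/*
 * Low-level bring-up: enable the PCI function, map the function BAR, then
 * initialize DMA pools, MSI-X vectors, the common interrupt, the AEQ, the
 * command queue and the completion event queues, in that order.
 */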
static int erdma_probe_dev(struct pci_dev *pdev)
{
        struct erdma_dev *dev;
        int bars, err;
        u32 version;

        err = pci_enable_device(pdev);
        if (err) {
                dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err);
                return err;
        }

        pci_set_master(pdev);

        dev = ib_alloc_device(erdma_dev, ibdev);
        if (!dev) {
                dev_err(&pdev->dev, "ib_alloc_device failed\n");
                err = -ENOMEM;
                goto err_disable_device;
        }

        pci_set_drvdata(pdev, dev);
        dev->pdev = pdev;
        dev->attrs.numa_node = dev_to_node(&pdev->dev);

        bars = pci_select_bars(pdev, IORESOURCE_MEM);
        if (bars != ERDMA_BAR_MASK) {
                err = -EINVAL;
                goto err_ib_device_release;
        }

        err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
        if (err)
                goto err_ib_device_release;

        dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR);
        dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR);

        dev->func_bar =
                devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len);
        if (!dev->func_bar) {
                dev_err(&pdev->dev, "devm_ioremap failed.\n");
                err = -EFAULT;
                goto err_release_bars;
        }

        version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG);
        if (version == 0) {
                /* A version of zero means this is a non-functional PCI function. */
                err = -ENODEV;
                goto err_iounmap_func_bar;
        }

        err = erdma_device_init(dev, pdev);
        if (err)
                goto err_iounmap_func_bar;

        err = erdma_request_vectors(dev);
        if (err)
                goto err_uninit_device;

        err = erdma_comm_irq_init(dev);
        if (err)
                goto err_free_vectors;

        err = erdma_aeq_init(dev);
        if (err)
                goto err_uninit_comm_irq;

        err = erdma_cmdq_init(dev);
        if (err)
                goto err_uninit_aeq;

        err = erdma_wait_hw_init_done(dev);
        if (err)
                goto err_uninit_cmdq;

        err = erdma_ceqs_init(dev);
        if (err)
                goto err_reset_hw;

        erdma_finish_cmdq_init(dev);

        return 0;

err_reset_hw:
        erdma_hw_reset(dev);

err_uninit_cmdq:
        erdma_cmdq_destroy(dev);

err_uninit_aeq:
        erdma_eq_destroy(dev, &dev->aeq);

err_uninit_comm_irq:
        erdma_comm_irq_uninit(dev);

err_free_vectors:
        pci_free_irq_vectors(dev->pdev);

err_uninit_device:
        erdma_device_uninit(dev);

err_iounmap_func_bar:
        devm_iounmap(&pdev->dev, dev->func_bar);

err_release_bars:
        pci_release_selected_regions(pdev, bars);

err_ib_device_release:
        ib_dealloc_device(&dev->ibdev);

err_disable_device:
        pci_disable_device(pdev);

        return err;
}

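/* Tear down everything set up by erdma_probe_dev(), in reverse order. */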
static void erdma_remove_dev(struct pci_dev *pdev)
{
        struct erdma_dev *dev = pci_get_drvdata(pdev);

        erdma_ceqs_uninit(dev);
        erdma_hw_reset(dev);
        erdma_cmdq_destroy(dev);
        erdma_eq_destroy(dev, &dev->aeq);
        erdma_comm_irq_uninit(dev);
        pci_free_irq_vectors(dev->pdev);
        erdma_device_uninit(dev);

        devm_iounmap(&pdev->dev, dev->func_bar);
        pci_release_selected_regions(pdev, ERDMA_BAR_MASK);

        ib_dealloc_device(&dev->ibdev);

        pci_disable_device(pdev);
}

#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap)

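/*
 * Query device capabilities and firmware information through the command
 * queue and populate dev->attrs and the resource allocator limits.
 */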
static int erdma_dev_attrs_init(struct erdma_dev *dev)
{
        int err;
        u64 req_hdr, cap0, cap1;

        erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA,
                                CMDQ_OPCODE_QUERY_DEVICE);

        err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
                                  &cap1, true);
        if (err)
                return err;

        dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0);
        dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0);
        dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1);
        dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0);
        dev->attrs.max_gid = 1 << ERDMA_GET_CAP(MAX_GID, cap0);
        dev->attrs.max_ah = 1 << ERDMA_GET_CAP(MAX_AH, cap0);
        dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1);
        dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1);
        dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1);
        dev->attrs.max_mr = dev->attrs.max_qp << 1;
        dev->attrs.max_cq = dev->attrs.max_qp << 1;
        dev->attrs.cap_flags = ERDMA_GET_CAP(FLAGS, cap0);

        dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR;
        dev->attrs.max_ord = ERDMA_MAX_ORD;
        dev->attrs.max_ird = ERDMA_MAX_IRD;
        dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE;
        dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE;
        dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD;
        dev->attrs.max_pd = ERDMA_MAX_PD;

        dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD;
        dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr;
        dev->res_cb[ERDMA_RES_TYPE_AH].max_cap = dev->attrs.max_ah;

        erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON,
                                CMDQ_OPCODE_QUERY_FW_INFO);

        err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0,
                                  &cap1, true);
        if (!err)
                dev->attrs.fw_version =
                        FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0);

        return err;
}

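/*
 * When the device supports the extended doorbell capability, report the
 * host PAGE_SHIFT to it via a CONF_DEVICE command; otherwise nothing
 * needs to be configured.
 */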
static int erdma_device_config(struct erdma_dev *dev)
{
        struct erdma_cmdq_config_device_req req = {};

        if (!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_EXTEND_DB))
                return 0;

        erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
                                CMDQ_OPCODE_CONF_DEVICE);

        req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) |
                  FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1);

        return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL,
                                   true);
}

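/*
 * Allocate the bitmaps used to hand out driver-managed resource IDs
 * (PDs, STag indexes and AHs).
 */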
static int erdma_res_cb_init(struct erdma_dev *dev)
{
        int i, j;

        for (i = 0; i < ERDMA_RES_CNT; i++) {
                dev->res_cb[i].next_alloc_idx = 1;
                spin_lock_init(&dev->res_cb[i].lock);
                dev->res_cb[i].bitmap =
                        bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL);
                if (!dev->res_cb[i].bitmap)
                        goto err;
        }

        return 0;

err:
        for (j = 0; j < i; j++)
                bitmap_free(dev->res_cb[j].bitmap);

        return -ENOMEM;
}

static void erdma_res_cb_free(struct erdma_dev *dev)
{
        int i;

        for (i = 0; i < ERDMA_RES_CNT; i++)
                bitmap_free(dev->res_cb[i].bitmap);
}

static const struct ib_device_ops erdma_device_ops_rocev2 = {
        .get_link_layer = erdma_get_link_layer,
        .add_gid = erdma_add_gid,
        .del_gid = erdma_del_gid,
        .query_pkey = erdma_query_pkey,
        .create_ah = erdma_create_ah,
        .destroy_ah = erdma_destroy_ah,
        .query_ah = erdma_query_ah,

        INIT_RDMA_OBJ_SIZE(ib_ah, erdma_ah, ibah),
};

static const struct ib_device_ops erdma_device_ops_iwarp = {
        .iw_accept = erdma_accept,
        .iw_add_ref = erdma_qp_get_ref,
        .iw_connect = erdma_connect,
        .iw_create_listen = erdma_create_listen,
        .iw_destroy_listen = erdma_destroy_listen,
        .iw_get_qp = erdma_get_ibqp,
        .iw_reject = erdma_reject,
        .iw_rem_ref = erdma_qp_put_ref,
};

static const struct ib_device_ops erdma_device_ops = {
        .owner = THIS_MODULE,
        .driver_id = RDMA_DRIVER_ERDMA,
        .uverbs_abi_ver = ERDMA_ABI_VERSION,

        .alloc_hw_port_stats = erdma_alloc_hw_port_stats,
        .alloc_mr = erdma_ib_alloc_mr,
        .alloc_pd = erdma_alloc_pd,
        .alloc_ucontext = erdma_alloc_ucontext,
        .create_cq = erdma_create_cq,
        .create_qp = erdma_create_qp,
        .dealloc_pd = erdma_dealloc_pd,
        .dealloc_ucontext = erdma_dealloc_ucontext,
        .dereg_mr = erdma_dereg_mr,
        .destroy_cq = erdma_destroy_cq,
        .destroy_qp = erdma_destroy_qp,
        .disassociate_ucontext = erdma_disassociate_ucontext,
        .get_dma_mr = erdma_get_dma_mr,
        .get_hw_stats = erdma_get_hw_stats,
        .get_port_immutable = erdma_get_port_immutable,
        .map_mr_sg = erdma_map_mr_sg,
        .mmap = erdma_mmap,
        .mmap_free = erdma_mmap_free,
        .post_recv = erdma_post_recv,
        .post_send = erdma_post_send,
        .poll_cq = erdma_poll_cq,
        .query_device = erdma_query_device,
        .query_gid = erdma_query_gid,
        .query_port = erdma_query_port,
        .query_qp = erdma_query_qp,
        .req_notify_cq = erdma_req_notify_cq,
        .reg_user_mr = erdma_reg_user_mr,
        .modify_qp = erdma_modify_qp,

        INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq),
        INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext),
        INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp),
};

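/*
 * Upper half of probe: query device attributes, install the verbs ops for
 * the iWarp or RoCEv2 variant, set up driver-private resource tables and
 * register the ib_device with the RDMA core.
 */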
static int erdma_ib_device_add(struct pci_dev *pdev)
{
        struct erdma_dev *dev = pci_get_drvdata(pdev);
        struct ib_device *ibdev = &dev->ibdev;
        u64 mac;
        int ret;

        ret = erdma_dev_attrs_init(dev);
        if (ret)
                return ret;

        ret = erdma_device_config(dev);
        if (ret)
                return ret;

        if (erdma_device_iwarp(dev)) {
                ibdev->node_type = RDMA_NODE_RNIC;
                ib_set_device_ops(ibdev, &erdma_device_ops_iwarp);
        } else {
                ibdev->node_type = RDMA_NODE_IB_CA;
                ib_set_device_ops(ibdev, &erdma_device_ops_rocev2);
        }

        memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC));

        /*
         * Current model (one-to-one device association):
         * One ERDMA device per net_device or, equivalently,
         * per physical port.
         */
        ibdev->phys_port_cnt = 1;
        ibdev->num_comp_vectors = dev->attrs.irq_num - 1;

        ib_set_device_ops(ibdev, &erdma_device_ops);

        INIT_LIST_HEAD(&dev->cep_list);

        spin_lock_init(&dev->lock);
        xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1);
        xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1);
        dev->next_alloc_cqn = 1;
        dev->next_alloc_qpn = 1;

        ret = erdma_res_cb_init(dev);
        if (ret)
                return ret;

        atomic_set(&dev->num_ctx, 0);

        mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG);
        mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32;

        u64_to_ether_addr(mac, dev->attrs.peer_addr);

        dev->reflush_wq = alloc_workqueue("erdma-reflush-wq", WQ_UNBOUND,
                                          WQ_UNBOUND_MAX_ACTIVE);
        if (!dev->reflush_wq) {
                ret = -ENOMEM;
                goto err_alloc_workqueue;
        }

        ret = erdma_device_register(dev);
        if (ret)
                goto err_register;

        return 0;

err_register:
        destroy_workqueue(dev->reflush_wq);
err_alloc_workqueue:
        xa_destroy(&dev->qp_xa);
        xa_destroy(&dev->cq_xa);

        erdma_res_cb_free(dev);

        return ret;
}

static void erdma_ib_device_remove(struct pci_dev *pdev)
{
        struct erdma_dev *dev = pci_get_drvdata(pdev);

        unregister_netdevice_notifier(&dev->netdev_nb);
        ib_unregister_device(&dev->ibdev);

        destroy_workqueue(dev->reflush_wq);
        erdma_res_cb_free(dev);
        xa_destroy(&dev->qp_xa);
        xa_destroy(&dev->cq_xa);
}

static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
        int ret;

        ret = erdma_probe_dev(pdev);
        if (ret)
                return ret;

        ret = erdma_ib_device_add(pdev);
        if (ret) {
                erdma_remove_dev(pdev);
                return ret;
        }

        return 0;
}

static void erdma_remove(struct pci_dev *pdev)
{
        erdma_ib_device_remove(pdev);
        erdma_remove_dev(pdev);
}

static struct pci_driver erdma_pci_driver = {
        .name = DRV_MODULE_NAME,
        .id_table = erdma_pci_tbl,
        .probe = erdma_probe,
        .remove = erdma_remove
};

MODULE_DEVICE_TABLE(pci, erdma_pci_tbl);

static int __init erdma_init_module(void)
{
        int ret;

        ret = erdma_cm_init();
        if (ret)
                return ret;

        ret = pci_register_driver(&erdma_pci_driver);
        if (ret)
                erdma_cm_exit();

        return ret;
}

static void __exit erdma_exit_module(void)
{
        pci_unregister_driver(&erdma_pci_driver);

        erdma_cm_exit();
}

module_init(erdma_init_module);
module_exit(erdma_exit_module);