root/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */

#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/vport.h>
#include "mlx5_core.h"
#include "mlx5_irq.h"
#include "pci_irq.h"
#include "lib/sf.h"
#include "lib/eq.h"
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif

/* One SF control IRQ is provisioned per this many SFs (rounded up) */
#define MLX5_SFS_PER_CTRL_IRQ 64
/* Per-SF MSI-X budget used when sizing the total vector count */
#define MLX5_MAX_MSIX_PER_SF 256
/* Upper bound on the number of SF control IRQs */
#define MLX5_IRQ_CTRL_SF_MAX 8
/* min num of vectors for SFs to be enabled */
#define MLX5_IRQ_VEC_COMP_BASE_SF 2
/* Offset of the first completion vector; index 0 is named as the async vector */
#define MLX5_IRQ_VEC_COMP_BASE 1

/* Min/max sharing thresholds handed to irq_pool_alloc(), which scales them
 * by MLX5_EQ_REFS_PER_IRQ before storing them in the pool.
 */
#define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
#define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
#define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
#define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)

/* Per-vector IRQ object tracked by an mlx5_irq_pool. */
struct mlx5_irq {
        /* notifier chain run from the hard IRQ handler; also the dev_id
         * cookie passed to request_irq()/free_irq()
         */
        struct atomic_notifier_head nh;
        /* affinity mask copied from the affinity descriptor at allocation */
        cpumask_var_t mask;
        /* formatted name handed to request_irq() */
        char name[MLX5_MAX_IRQ_FORMATTED_NAME];
        /* pool that owns this IRQ */
        struct mlx5_irq_pool *pool;
        /* number of users; modified under pool->lock */
        int refcount;
        /* MSI-X index and Linux virq number of the vector */
        struct msi_map map;
        /* key of this IRQ in pool->irqs */
        u32 pool_index;
};

/* Per-device collection of IRQ pools. The SF pools are optional and stay
 * NULL when they are not created.
 */
struct mlx5_irq_table {
        /* pool for the PCI function itself; also the fallback for SFs */
        struct mlx5_irq_pool *pcif_pool;
        /* pool of SF control IRQs */
        struct mlx5_irq_pool *sf_ctrl_pool;
        /* pool of SF completion IRQs */
        struct mlx5_irq_pool *sf_comp_pool;
};

/* Translate a function number into a vport number. EC VF functions are placed
 * at (func - 1) past the EC VF vport base; any other function maps to itself.
 */
static int mlx5_core_func_to_vport(const struct mlx5_core_dev *dev,
                                   int func,
                                   bool ec_vf_func)
{
        if (ec_vf_func)
                return mlx5_core_ec_vf_vport_base(dev) + func - 1;

        return func;
}

/**
 * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
 *                                   to be assigned to each VF.
 * @dev: PF to work on
 * @num_vfs: Number of enabled VFs
 *
 * Return: 0 when the device reports no dynamic VF MSI-X vectors, otherwise
 * the per-VF default, clamped between the device minimum and half of the
 * device maximum table size.
 */
int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs)
{
        int total_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
        int floor_msix, ceil_msix, per_vf;

        if (!total_msix)
                return 0;

        floor_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
        ceil_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);

        /* Cap the default at half of the per-VF maximum so that the pool
         * keeps spare vectors: a user can then grow one VF without first
         * shrinking the others.
         */
        ceil_msix /= 2;
        per_vf = min(total_msix / num_vfs, ceil_msix);

        return max(per_vf, floor_msix);
}

/**
 * mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF
 * @dev: PF to work on
 * @function_id: Internal PCI VF function ID
 * @msix_vec_count: Number of MSI-X vectors to set
 *
 * Return: 0 on success or when the device exposes no dynamic VF MSI-X
 * vectors, -EOPNOTSUPP when @dev is not a PF vport group manager, -EINVAL or
 * -EOVERFLOW when @msix_vec_count is below or above the device limits,
 * -ENOMEM on allocation failure, or the error returned by the query/set
 * commands.
 */
int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
                            int msix_vec_count)
{
        int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
        int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
        void *hca_cap = NULL, *query_cap = NULL, *cap;
        int num_vf_msix, min_msix, max_msix;
        bool ec_vf_function;
        int vport;
        int ret;

        /* Nothing to configure when there is no dynamic VF MSI-X pool */
        num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
        if (!num_vf_msix)
                return 0;

        if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev))
                return -EOPNOTSUPP;

        min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
        max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);

        if (msix_vec_count < min_msix)
                return -EINVAL;

        if (msix_vec_count > max_msix)
                return -EOVERFLOW;

        /* Both buffers are released at 'out'; kvfree(NULL) is harmless, so a
         * single check covers either allocation failing.
         */
        query_cap = kvzalloc(query_sz, GFP_KERNEL);
        hca_cap = kvzalloc(set_sz, GFP_KERNEL);
        if (!hca_cap || !query_cap) {
                ret = -ENOMEM;
                goto out;
        }

        /* Read the target function's current general capabilities */
        ec_vf_function = mlx5_core_ec_sriov_enabled(dev);
        vport = mlx5_core_func_to_vport(dev, function_id, ec_vf_function);
        ret = mlx5_vport_get_other_func_general_cap(dev, vport, query_cap);
        if (ret)
                goto out;

        /* Start from the queried capabilities and override only the MSI-X
         * table size, so every other capability is written back unchanged.
         */
        cap = MLX5_ADDR_OF(set_hca_cap_in, hca_cap, capability);
        memcpy(cap, MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability),
               MLX5_UN_SZ_BYTES(hca_cap_union));
        MLX5_SET(cmd_hca_cap, cap, dynamic_msix_table_size, msix_vec_count);

        /* Address the command at the other function rather than at @dev */
        MLX5_SET(set_hca_cap_in, hca_cap, opcode, MLX5_CMD_OP_SET_HCA_CAP);
        MLX5_SET(set_hca_cap_in, hca_cap, other_function, 1);
        MLX5_SET(set_hca_cap_in, hca_cap, ec_vf_function, ec_vf_function);
        MLX5_SET(set_hca_cap_in, hca_cap, function_id, function_id);

        /* NOTE(review): the capability type is shifted left by one here;
         * presumably the low op_mod bit carries a separate selector - confirm
         * against the command layout.
         */
        MLX5_SET(set_hca_cap_in, hca_cap, op_mod,
                 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1);
        ret = mlx5_cmd_exec_in(dev, set_hca_cap, hca_cap);
out:
        kvfree(hca_cap);
        kvfree(query_cap);
        return ret;
}

/* mlx5_system_free_irq - Free an IRQ
 * @irq: IRQ to free
 *
 * Free the IRQ and other resources such as rmap from the system.
 * BUT doesn't free or remove reference from mlx5.
 * This function is very important for the shutdown flow, where we need to
 * cleanup system resources but keep mlx5 objects alive,
 * see mlx5_irq_table_free_irqs().
 */
static void mlx5_system_free_irq(struct mlx5_irq *irq)
{
        struct mlx5_irq_pool *pool = irq->pool;
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap *rmap;
#endif

        /* free_irq requires that affinity_hint and rmap will be cleared before
         * calling it. To satisfy this requirement, we call
         * irq_cpu_rmap_remove() to remove the notifier
         */
        irq_update_affinity_hint(irq->map.virq, NULL);
#ifdef CONFIG_RFS_ACCEL
        rmap = mlx5_eq_table_get_rmap(pool->dev);
        if (rmap)
                irq_cpu_rmap_remove(rmap, irq->map.virq);
#endif

        free_irq(irq->map.virq, &irq->nh);
        if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev))
                pci_msix_free_irq(pool->dev->pdev, irq->map);
}

/* Unlink @irq from its pool, release its system resources and free the
 * object. @irq must not be used afterwards.
 */
static void irq_release(struct mlx5_irq *irq)
{
        xa_erase(&irq->pool->irqs, irq->pool_index);
        mlx5_system_free_irq(irq);
        free_cpumask_var(irq->mask);
        kfree(irq);
}

int mlx5_irq_put(struct mlx5_irq *irq)
{
        struct mlx5_irq_pool *pool = irq->pool;
        int ret = 0;

        mutex_lock(&pool->lock);
        irq->refcount--;
        if (!irq->refcount) {
                irq_release(irq);
                ret = 1;
        }
        mutex_unlock(&pool->lock);
        return ret;
}

int mlx5_irq_read_locked(struct mlx5_irq *irq)
{
        lockdep_assert_held(&irq->pool->lock);
        return irq->refcount;
}

/* Take one more reference on @irq. The pool lock must be held.
 *
 * Returns 1 on success. Returns 0 (and warns once) when the reference count
 * is already zero, i.e. the IRQ is not live.
 */
int mlx5_irq_get_locked(struct mlx5_irq *irq)
{
        lockdep_assert_held(&irq->pool->lock);

        if (!WARN_ON_ONCE(!irq->refcount)) {
                irq->refcount++;
                return 1;
        }

        return 0;
}

static int irq_get(struct mlx5_irq *irq)
{
        int err;

        mutex_lock(&irq->pool->lock);
        err = mlx5_irq_get_locked(irq);
        mutex_unlock(&irq->pool->lock);
        return err;
}

/* Hard IRQ handler: @nh is the notifier head registered as dev_id; every
 * attached notifier block is invoked and the interrupt is always reported
 * as handled.
 */
static irqreturn_t irq_int_handler(int irq, void *nh)
{
        struct atomic_notifier_head *chain = nh;

        atomic_notifier_call_chain(chain, 0, NULL);

        return IRQ_HANDLED;
}

/* Build the short name of an SF pool IRQ: the pool name followed by the
 * vector index.
 */
static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
        const char *prefix = pool->name;

        snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", prefix, vecidx);
}

/* Build the short name of a PCI function pool IRQ:
 *  - "mlx5_combined<idx>" when the pool's highest index is 0 (a single IRQ
 *    for the device),
 *  - "mlx5_async<idx>" for vector 0,
 *  - "mlx5_comp<n>" otherwise, with n counted from the first completion
 *    vector.
 */
static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
{
        if (!pool->xa_num_irqs.max)
                snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_combined%d", vecidx);
        else if (!vecidx)
                snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
        else
                snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d",
                         vecidx - MLX5_IRQ_VEC_COMP_BASE);
}

/* mlx5_irq_alloc - allocate, request and register a new IRQ in a pool
 * @pool: pool that will own the IRQ
 * @i: index of the IRQ inside @pool
 * @af_desc: affinity descriptor; may be NULL, in which case no affinity is set
 * @rmap: optional pointer to the reverse map pointer; used for non-zero @i only
 *
 * On success the IRQ is stored in pool->irqs at index @i with a reference
 * count of 1. irq_pool_request_vector() calls this with pool->lock held.
 *
 * Returns the new IRQ or an ERR_PTR on failure.
 */
struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
                                struct irq_affinity_desc *af_desc,
                                struct cpu_rmap **rmap)
{
        struct mlx5_core_dev *dev = pool->dev;
        char name[MLX5_MAX_IRQ_NAME];
        struct mlx5_irq *irq;
        int err;

        /* kfree(NULL) is a no-op, so one failure branch covers both the
         * object and the cpumask allocation.
         */
        irq = kzalloc_obj(*irq);
        if (!irq || !zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
                kfree(irq);
                return ERR_PTR(-ENOMEM);
        }

        if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
                /* The vector at index 0 is always statically allocated. If
                 * dynamic irq is not supported all vectors are statically
                 * allocated. In both cases just get the irq number and set
                 * the index.
                 */
                irq->map.virq = pci_irq_vector(dev->pdev, i);
                irq->map.index = i;
        } else {
                irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc);
                if (!irq->map.virq) {
                        /* on failure the index field holds the error code */
                        err = irq->map.index;
                        goto err_alloc_irq;
                }
        }

        /* Vector 0 is never added to the reverse map */
        if (i && rmap && *rmap) {
#ifdef CONFIG_RFS_ACCEL
                err = irq_cpu_rmap_add(*rmap, irq->map.virq);
                if (err)
                        goto err_irq_rmap;
#endif
        }
        if (!mlx5_irq_pool_is_sf_pool(pool))
                irq_set_name(pool, name, i);
        else
                irq_sf_set_name(pool, name, i);
        ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
        snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME,
                 MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev));
        /* The notifier head doubles as the dev_id cookie for the handler */
        err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
                          &irq->nh);
        if (err) {
                mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
                goto err_req_irq;
        }

        if (af_desc) {
                cpumask_copy(irq->mask, &af_desc->mask);
                irq_set_affinity_and_hint(irq->map.virq, irq->mask);
        }
        irq->pool = pool;
        irq->refcount = 1;
        irq->pool_index = i;
        err = xa_err(xa_store(&pool->irqs, irq->pool_index, irq, GFP_KERNEL));
        if (err) {
                mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
                              irq->pool_index, err);
                goto err_xa;
        }
        return irq;
        /* Unwind in reverse order of setup; each label undoes the steps that
         * completed before the failing one.
         */
err_xa:
        if (af_desc)
                irq_update_affinity_hint(irq->map.virq, NULL);
        free_irq(irq->map.virq, &irq->nh);
err_req_irq:
#ifdef CONFIG_RFS_ACCEL
        if (i && rmap && *rmap)
                irq_cpu_rmap_remove(*rmap, irq->map.virq);
err_irq_rmap:
#endif
        /* Only dynamically allocated vectors (non-zero index) are returned */
        if (i && pci_msix_can_alloc_dyn(dev->pdev))
                pci_msix_free_irq(dev->pdev, irq->map);
err_alloc_irq:
        free_cpumask_var(irq->mask);
        kfree(irq);
        return ERR_PTR(err);
}

/* Attach notifier block @nb to @irq, taking a reference on the IRQ for as
 * long as the block stays attached.
 *
 * Returns 0 on success, -ENOENT when no reference could be taken, or the
 * error from the notifier chain registration (the reference is dropped again
 * in that case).
 */
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
        int err;

        /* Failing to take a reference means an EQ is being enabled on a
         * non-existing IRQ, which should never happen.
         */
        if (!irq_get(irq))
                return -ENOENT;

        err = atomic_notifier_chain_register(&irq->nh, nb);
        if (err)
                mlx5_irq_put(irq);

        return err;
}

/* Detach notifier block @nb from @irq and drop the reference taken at attach
 * time. The reference is dropped whether or not the unregister succeeded.
 *
 * Returns the result of the notifier chain unregistration.
 */
int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
{
        int err = atomic_notifier_chain_unregister(&irq->nh, nb);

        mlx5_irq_put(irq);

        return err;
}

/* Return the affinity mask stored in @irq. */
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
{
        struct cpumask *affinity = irq->mask;

        return affinity;
}

int mlx5_irq_get_irq(const struct mlx5_irq *irq)
{
        return irq->map.virq;
}

int mlx5_irq_get_index(struct mlx5_irq *irq)
{
        return irq->map.index;
}

/* Return the pool that owns @irq. */
struct mlx5_irq_pool *mlx5_irq_get_pool(struct mlx5_irq *irq)
{
        struct mlx5_irq_pool *owner = irq->pool;

        return owner;
}

/* irq_pool API */

/* Request the IRQ at index @vecidx of @pool. An IRQ that already exists gets
 * one more reference; otherwise a new one is allocated with @af_desc and
 * @rmap. Returns the IRQ, or the ERR_PTR produced by the allocation.
 */
static struct mlx5_irq *
irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
                        struct irq_affinity_desc *af_desc,
                        struct cpu_rmap **rmap)
{
        struct mlx5_irq *irq;

        mutex_lock(&pool->lock);
        irq = xa_load(&pool->irqs, vecidx);
        if (!irq)
                irq = mlx5_irq_alloc(pool, vecidx, af_desc, rmap);
        else
                mlx5_irq_get_locked(irq);
        mutex_unlock(&pool->lock);

        return irq;
}

/* Return the SF control IRQ pool of @irq_table (NULL when not created). */
static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table)
{
        struct mlx5_irq_pool *ctrl_pool = irq_table->sf_ctrl_pool;

        return ctrl_pool;
}

/* Return the SF completion IRQ pool of @irq_table (NULL when not created). */
static struct mlx5_irq_pool *
sf_comp_irq_pool_get(struct mlx5_irq_table *irq_table)
{
        struct mlx5_irq_pool *comp_pool = irq_table->sf_comp_pool;

        return comp_pool;
}

/* Return the pool from which @dev takes its completion IRQs: the SF
 * completion pool for an SF when that pool exists, the PCI function pool
 * otherwise.
 */
struct mlx5_irq_pool *
mlx5_irq_table_get_comp_irq_pool(struct mlx5_core_dev *dev)
{
        struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);

        if (mlx5_core_is_sf(dev)) {
                struct mlx5_irq_pool *sf_pool = sf_comp_irq_pool_get(irq_table);

                if (sf_pool)
                        return sf_pool;
        }

        /* Some configurations have no SF IRQ pools at all; SFs then share
         * the PF pool.
         */
        return irq_table->pcif_pool;
}

/* Return the pool from which @dev takes its control IRQ: the SF control pool
 * for an SF when that pool exists, the PCI function pool otherwise.
 */
static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
{
        struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);

        if (mlx5_core_is_sf(dev)) {
                struct mlx5_irq_pool *sf_pool = sf_ctrl_irq_pool_get(irq_table);

                if (sf_pool)
                        return sf_pool;
        }

        /* Some configurations have no SF IRQ pools at all; SFs then share
         * the PF pool.
         */
        return irq_table->pcif_pool;
}

/* Wait for in-flight handlers of @irq to finish, then drop one reference. */
static void _mlx5_irq_release(struct mlx5_irq *irq)
{
        int virq = irq->map.virq;

        synchronize_irq(virq);
        mlx5_irq_put(irq);
}

/**
 * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system.
 * @dev: mlx5 device that is releasing the IRQ.
 * @ctrl_irq: ctrl IRQ to be released.
 *
 * The release is delegated to mlx5_irq_affinity_irq_release().
 */
void mlx5_ctrl_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *ctrl_irq)
{
        mlx5_irq_affinity_irq_release(dev, ctrl_irq);
}

/**
 * mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device.
 * @dev: mlx5 device that is requesting the IRQ.
 *
 * The IRQ is requested with an unmanaged affinity covering all online CPUs,
 * except when the PCI function pool holds a single IRQ, in which case it is
 * restricted to the first online CPU.
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
{
        struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
        struct irq_affinity_desc *af_desc;
        struct mlx5_irq *irq;

        af_desc = kvzalloc_obj(*af_desc);
        if (!af_desc)
                return ERR_PTR(-ENOMEM);

        af_desc->is_managed = false;
        cpumask_copy(&af_desc->mask, cpu_online_mask);

        if (mlx5_irq_pool_is_sf_pool(pool)) {
                irq = mlx5_irq_affinity_request(dev, pool, af_desc);
                goto out;
        }

        /* The control IRQ comes from a PCI device's pool. An SF also ends up
         * here when there is no SF pool.
         */
        if (!pool->xa_num_irqs.max) {
                /* A single IRQ serves the whole PF/VF: pin it to one CPU */
                cpumask_clear(&af_desc->mask);
                cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc->mask);
        }
        /* Index 0 is used; its vector was already allocated */
        irq = irq_pool_request_vector(pool, 0, af_desc, NULL);
out:
        kvfree(af_desc);

        return irq;
}

/**
 * mlx5_irq_request - request an IRQ for mlx5 PF/VF device.
 * @dev: mlx5 device that is requesting the IRQ.
 * @vecidx: index of the IRQ inside the PCI function pool.
 * @af_desc: affinity descriptor for this IRQ. It is applied only when a new
 * IRQ is allocated, and it is read for the debug print, so it must be valid.
 * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * This function returns a pointer to IRQ, or ERR_PTR in case of error.
 */
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
                                  struct irq_affinity_desc *af_desc,
                                  struct cpu_rmap **rmap)
{
        struct mlx5_irq_pool *pool = mlx5_irq_table_get(dev)->pcif_pool;
        struct mlx5_irq *irq;

        irq = irq_pool_request_vector(pool, vecidx, af_desc, rmap);
        if (!IS_ERR(irq)) {
                mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
                              irq->map.virq, cpumask_pr_args(&af_desc->mask),
                              irq->refcount / MLX5_EQ_REFS_PER_IRQ);
        }

        return irq;
}

/**
 * mlx5_irq_release_vector - release one IRQ back to the system.
 * @irq: the irq to release.
 *
 * Waits for running handlers of @irq to complete and drops one reference.
 */
void mlx5_irq_release_vector(struct mlx5_irq *irq)
{
        _mlx5_irq_release(irq);
}

/**
 * mlx5_irq_request_vector - request one IRQ for mlx5 device.
 * @dev: mlx5 device that is requesting the IRQ.
 * @cpu: CPU to bind the IRQ to.
 * @vecidx: vector index to request an IRQ for.
 * @rmap: pointer to reverse map pointer for completion interrupts
 *
 * Each IRQ is bound to at most 1 CPU.
 * This function requests one IRQ, for the given @vecidx. The index is shifted
 * past the async vector unless the pool holds a single IRQ.
 *
 * This function returns a pointer to the irq on success, or an error pointer
 * in case of an error.
 */
struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu,
                                         u16 vecidx, struct cpu_rmap **rmap)
{
        struct mlx5_irq_table *table = mlx5_irq_table_get(dev);
        struct mlx5_irq_pool *pool = table->pcif_pool;
        struct irq_affinity_desc *af_desc;
        struct mlx5_irq *irq;
        int base;

        af_desc = kvzalloc_obj(*af_desc);
        if (!af_desc)
                return ERR_PTR(-ENOMEM);

        /* With a single IRQ in the pool there is no separate async vector */
        base = pool->xa_num_irqs.max ? MLX5_IRQ_VEC_COMP_BASE : 0;

        /* Unmanaged affinity restricted to @cpu only */
        cpumask_clear(&af_desc->mask);
        cpumask_set_cpu(cpu, &af_desc->mask);
        af_desc->is_managed = false;

        irq = mlx5_irq_request(dev, vecidx + base, af_desc, rmap);

        kvfree(af_desc);

        return irq;
}

/* Allocate and initialise an IRQ pool covering indices
 * [@start, @start + @size - 1]. @name may be NULL, in which case the pool
 * name is left empty. Both thresholds are stored scaled by
 * MLX5_EQ_REFS_PER_IRQ.
 *
 * Returns the pool, or ERR_PTR(-ENOMEM).
 */
static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
               u32 min_threshold, u32 max_threshold)
{
        struct mlx5_irq_pool *pool;

        pool = kvzalloc_obj(*pool);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        pool->dev = dev;
        pool->xa_num_irqs.min = start;
        pool->xa_num_irqs.max = start + size - 1;
        pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ;
        pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ;

        mutex_init(&pool->lock);
        xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);

        if (name)
                snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
                         "%s", name);

        mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
                      name ? name : "mlx5_pcif_pool", size, start);

        return pool;
}

static void irq_pool_free(struct mlx5_irq_pool *pool)
{
        struct mlx5_irq *irq;
        unsigned long index;

        /* There are cases in which we are destroying the irq_table before
         * freeing all the IRQs, fast teardown for example. Hence, free the irqs
         * which might not have been freed.
         */
        xa_for_each(&pool->irqs, index, irq)
                irq_release(irq);
        xa_destroy(&pool->irqs);
        mutex_destroy(&pool->lock);
        kfree(pool->irqs_per_cpu);
        kvfree(pool);
}

/* Create the IRQ pools of @dev.
 * @dev: device whose irq_table is populated
 * @sf_vec: number of vectors available for SFs
 * @pcif_vec: number of vectors reserved for the PCI function pool
 * @dynamic_vec: true when MSI-X vectors can be allocated dynamically
 *
 * The PCI function pool is always created. The SF control and completion
 * pools are created only when the device supports SFs and enough vectors are
 * available; otherwise they stay NULL and SFs fall back to the PCI function
 * pool.
 *
 * The table outlives the pools, so it is reset up front and a pool pointer is
 * published only once that pool is fully set up. On failure every pointer in
 * the table is NULL again, so no ERR_PTR or freed pool is ever left behind
 * for the NULL checks used elsewhere.
 *
 * Returns 0 on success or a negative errno.
 */
static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec,
                          bool dynamic_vec)
{
        struct mlx5_irq_table *table = dev->priv.irq_table;
        int sf_vec_available = sf_vec;
        struct mlx5_irq_pool *pool;
        int num_sf_ctrl;
        int err;

        /* Drop pointers that may be left over from a previous cycle */
        table->pcif_pool = NULL;
        table->sf_ctrl_pool = NULL;
        table->sf_comp_pool = NULL;

        /* init pcif_pool */
        pool = irq_pool_alloc(dev, 0, pcif_vec, NULL,
                              MLX5_EQ_SHARE_IRQ_MIN_COMP,
                              MLX5_EQ_SHARE_IRQ_MAX_COMP);
        if (IS_ERR(pool))
                return PTR_ERR(pool);
        table->pcif_pool = pool;

        if (!mlx5_sf_max_functions(dev))
                return 0;
        if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
                mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n");
                return 0;
        }

        /* init sf_ctrl_pool */
        num_sf_ctrl = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
                                   MLX5_SFS_PER_CTRL_IRQ);
        num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
        if (!dynamic_vec && (num_sf_ctrl + 1) > sf_vec_available) {
                mlx5_core_dbg(dev,
                              "Not enough IRQs for SFs control and completion pool, required=%d avail=%d\n",
                              num_sf_ctrl + 1, sf_vec_available);
                return 0;
        }

        pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
                              "mlx5_sf_ctrl",
                              MLX5_EQ_SHARE_IRQ_MIN_CTRL,
                              MLX5_EQ_SHARE_IRQ_MAX_CTRL);
        if (IS_ERR(pool)) {
                err = PTR_ERR(pool);
                goto err_pf;
        }
        table->sf_ctrl_pool = pool;
        sf_vec_available -= num_sf_ctrl;

        /* init sf_comp_pool, remaining vectors are for the SF completions */
        pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
                              sf_vec_available, "mlx5_sf_comp",
                              MLX5_EQ_SHARE_IRQ_MIN_COMP,
                              MLX5_EQ_SHARE_IRQ_MAX_COMP);
        if (IS_ERR(pool)) {
                err = PTR_ERR(pool);
                goto err_sf_ctrl;
        }

        pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL);
        if (!pool->irqs_per_cpu) {
                err = -ENOMEM;
                goto err_irqs_per_cpu;
        }
        table->sf_comp_pool = pool;

        return 0;

err_irqs_per_cpu:
        /* the completion pool was never published in the table */
        irq_pool_free(pool);
err_sf_ctrl:
        irq_pool_free(table->sf_ctrl_pool);
        table->sf_ctrl_pool = NULL;
err_pf:
        irq_pool_free(table->pcif_pool);
        table->pcif_pool = NULL;
        return err;
}

static void irq_pools_destroy(struct mlx5_irq_table *table)
{
        if (table->sf_ctrl_pool) {
                irq_pool_free(table->sf_comp_pool);
                irq_pool_free(table->sf_ctrl_pool);
        }
        irq_pool_free(table->pcif_pool);
}

static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool)
{
        struct mlx5_irq *irq;
        unsigned long index;

        xa_for_each(&pool->irqs, index, irq)
                mlx5_system_free_irq(irq);

}

static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table)
{
        if (table->sf_ctrl_pool) {
                mlx5_irq_pool_free_irqs(table->sf_comp_pool);
                mlx5_irq_pool_free_irqs(table->sf_ctrl_pool);
        }
        mlx5_irq_pool_free_irqs(table->pcif_pool);
}

/* irq_table API */

/* Allocate the (empty) IRQ table of @dev on the device's NUMA node. SFs get
 * no table of their own, so this is a no-op for them.
 *
 * Returns 0 on success or -ENOMEM.
 */
int mlx5_irq_table_init(struct mlx5_core_dev *dev)
{
        struct mlx5_irq_table *irq_table;

        if (!mlx5_core_is_sf(dev)) {
                irq_table = kvzalloc_node(sizeof(*irq_table), GFP_KERNEL,
                                          dev->priv.numa_node);
                if (!irq_table)
                        return -ENOMEM;

                dev->priv.irq_table = irq_table;
        }

        return 0;
}

/* Free the IRQ table allocated by mlx5_irq_table_init(). No-op for SFs. */
void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
{
        if (!mlx5_core_is_sf(dev))
                kvfree(dev->priv.irq_table);
}

int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
{
        if (!table->pcif_pool->xa_num_irqs.max)
                return 1;
        return table->pcif_pool->xa_num_irqs.max - table->pcif_pool->xa_num_irqs.min;
}

/* Size and allocate the MSI-X vectors of @dev and create its IRQ pools.
 * No-op for SFs.
 *
 * Returns 0 on success, the negative value returned by
 * pci_alloc_irq_vectors(), or the error from irq_pools_init() (the vectors
 * are freed again in that case).
 */
int mlx5_irq_table_create(struct mlx5_core_dev *dev)
{
        int num_eqs = mlx5_max_eq_cap_get(dev);
        bool dynamic_vec;
        int total_vec;
        int pcif_vec;
        int req_vec;
        int err;
        int n;

        if (mlx5_core_is_sf(dev))
                return 0;

        /* PCI PF vectors usage is limited by online cpus, device EQs and
         * PCI MSI-X capability.
         */
        pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
        pcif_vec = min_t(int, pcif_vec, num_eqs);
        pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));

        /* Add the SF budget on top, still bounded by the MSI-X table size */
        total_vec = pcif_vec;
        if (mlx5_sf_max_functions(dev))
                total_vec += MLX5_MAX_MSIX_PER_SF * mlx5_sf_max_functions(dev);
        total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));

        /* With dynamic MSI-X only one vector is requested up front; the rest
         * are allocated on demand in mlx5_irq_alloc().
         */
        req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
        n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
        if (n < 0)
                return n;

        /* Further limit vectors of the pools based on platform for non dynamic case */
        dynamic_vec = pci_msix_can_alloc_dyn(dev->pdev);
        if (!dynamic_vec) {
                pcif_vec = min_t(int, n, pcif_vec);
                total_vec = min_t(int, n, total_vec);
        }

        /* Whatever is left after the PCI function pool goes to the SFs */
        err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec, dynamic_vec);
        if (err)
                pci_free_irq_vectors(dev->pdev);

        return err;
}

/* Destroy all IRQ pools of @dev and return its vectors to the PCI core.
 * No-op for SFs.
 */
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
{
        if (mlx5_core_is_sf(dev))
                return;

        /* IRQs may still be in use at this point; destroying the pools
         * releases whatever is left.
         */
        irq_pools_destroy(dev->priv.irq_table);
        pci_free_irq_vectors(dev->pdev);
}

/* Release the system resources of every IRQ of @dev and free its vectors,
 * while keeping the pools and the mlx5 IRQ objects alive. No-op for SFs.
 */
void mlx5_irq_table_free_irqs(struct mlx5_core_dev *dev)
{
        if (mlx5_core_is_sf(dev))
                return;

        mlx5_irq_pools_free_irqs(dev->priv.irq_table);
        pci_free_irq_vectors(dev->pdev);
}

/* Return the number of completion vectors usable by an SF: the size of the
 * SF completion pool capped by the number of online CPUs, or the PCI function
 * completion count when there is no SF completion pool.
 */
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
{
        struct mlx5_irq_pool *comp = table->sf_comp_pool;

        if (!comp)
                return mlx5_irq_table_get_num_comp(table);

        return min_t(int, num_online_cpus(),
                     comp->xa_num_irqs.max -
                     comp->xa_num_irqs.min + 1);
}

/* Return the IRQ table used by @dev. With SF support built in, an SF uses the
 * table of its parent device.
 */
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
{
        struct mlx5_core_dev *owner = dev;

#ifdef CONFIG_MLX5_SF
        if (mlx5_core_is_sf(dev))
                owner = dev->priv.parent_mdev;
#endif
        return owner->priv.irq_table;
}