root/usr/src/uts/common/io/ena/ena_gld.c
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

#include "ena.h"

/*
 * Group/Ring callbacks
 */

/*
 * Rx group "add unicast address" callback.
 *
 * The ena driver supports only a single mac address: the one assigned
 * to it by the hypervisor. Adding that address succeeds trivially;
 * any other unicast address is refused with ENOTSUP, which prevents
 * VNICs from being created, as it should. Multicast addresses are
 * rejected with EINVAL.
 */
static int
ena_group_add_mac(void *arg, const uint8_t *mac_addr)
{
        ena_t *dev = arg;
        int same;

        if (ETHER_IS_MULTICAST(mac_addr)) {
                return (EINVAL);
        }

        /* Only the device's own address is acceptable. */
        same = (bcmp(dev->ena_mac_addr, mac_addr, ETHERADDRL) == 0);

        return (same ? 0 : ENOTSUP);
}

/*
 * Rx group "remove unicast address" callback.
 *
 * Applies the same checks as adding an address: a multicast address
 * yields EINVAL, the device's own address yields 0, and anything else
 * yields ENOTSUP. No device or driver state is modified.
 */
static int
ena_group_rem_mac(void *arg, const uint8_t *mac_addr)
{
        ena_t *dev = arg;

        if (ETHER_IS_MULTICAST(mac_addr)) {
                return (EINVAL);
        }

        if (bcmp(dev->ena_mac_addr, mac_addr, ETHERADDRL) != 0) {
                return (ENOTSUP);
        }

        return (0);
}

/*
 * Rx ring interrupt-disable callback (installed as mi_disable).
 *
 * Under the queue lock, performs a read-modify-write of the register
 * at er_cq_unmask_addr to apply ENAHW_REG_INTR_MASK, then records
 * that the queue is in polling mode. Always returns 0.
 */
static int
ena_ring_rx_intr_disable(mac_intr_handle_t mih)
{
        ena_rxq_t *q = (ena_rxq_t *)mih;
        uint32_t reg;

        mutex_enter(&q->er_lock);

        /* Read the current control word, set the mask, write it back. */
        reg = ena_hw_abs_read32(q->er_ena, q->er_cq_unmask_addr);
        ENAHW_REG_INTR_MASK(reg);
        ena_hw_abs_write32(q->er_ena, q->er_cq_unmask_addr, reg);

        q->er_mode = ENA_RXQ_MODE_POLLING;

        mutex_exit(&q->er_lock);

        return (0);
}

/*
 * Rx ring interrupt-enable callback (installed as mi_enable).
 *
 * Under the queue lock, performs a read-modify-write of the register
 * at er_cq_unmask_addr to apply ENAHW_REG_INTR_UNMASK, then records
 * that the queue is in interrupt mode. Always returns 0.
 */
static int
ena_ring_rx_intr_enable(mac_intr_handle_t mih)
{
        ena_rxq_t *q = (ena_rxq_t *)mih;
        uint32_t reg;

        mutex_enter(&q->er_lock);

        /* Read the current control word, clear the mask, write it back. */
        reg = ena_hw_abs_read32(q->er_ena, q->er_cq_unmask_addr);
        ENAHW_REG_INTR_UNMASK(reg);
        ena_hw_abs_write32(q->er_ena, q->er_cq_unmask_addr, reg);

        q->er_mode = ENA_RXQ_MODE_INTR;

        mutex_exit(&q->er_lock);

        return (0);
}

/*
 * Rx group info callback (installed as mr_gget).
 *
 * Describes the Rx group to mac: the add/remove address callbacks,
 * no start/stop hooks, and a ring count of ena_num_intrs - 1. Only
 * Rx groups are expected; any other ring type trips the VERIFY.
 */
static void
ena_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index,
    mac_group_info_t *infop, mac_group_handle_t gh)
{
        ena_t *dev = arg;

        VERIFY3S(rtype, ==, MAC_RING_TYPE_RX);

        /*
         * A driver would normally hand mac a per-group structure as
         * mgi_driver. As there should only ever be one group, the
         * top-level ena_t stands in for it.
         */
        infop->mgi_driver = (mac_group_driver_t)dev;
        infop->mgi_count = dev->ena_num_intrs - 1;

        /* MAC address management. */
        infop->mgi_addmac = ena_group_add_mac;
        infop->mgi_remmac = ena_group_rem_mac;

        /* No group-level start/stop hooks. */
        infop->mgi_start = NULL;
        infop->mgi_stop = NULL;
}

/*
 * Tx ring info callback (installed as mr_rget for Tx rings).
 *
 * Binds the driver Tx queue at ring_index to mac's ring handle and
 * fills in the per-ring entry points. The ring type and index are
 * VERIFY'd.
 */
static void
ena_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
{
        ena_t *dev = arg;
        ena_txq_t *ring;

        VERIFY3S(rtype, ==, MAC_RING_TYPE_TX);
        VERIFY3S(ring_index, <, dev->ena_num_txqs);

        ring = &dev->ena_txqs[ring_index];

        /* Cross-link: the queue remembers mac's handle, mac gets the queue. */
        ring->et_mrh = rh;
        infop->mri_driver = (mac_ring_driver_t)ring;

        /* Per-ring entry points. */
        infop->mri_start = ena_ring_tx_start;
        infop->mri_stop = ena_ring_tx_stop;
        infop->mri_tx = ena_ring_tx;
        infop->mri_stat = ena_ring_tx_stat;
}

/*
 * Rx ring info callback (installed as mr_rget for Rx rings).
 *
 * Binds the driver Rx queue at ring_index to mac's ring handle, fills
 * in the per-ring entry points, and describes the ring's interrupt
 * (handle, enable/disable callbacks, and the DDI interrupt handle for
 * the queue's vector). The ring type and index are VERIFY'd.
 */
static void
ena_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index,
    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
{
        ena_t *dev = arg;
        ena_rxq_t *ring;
        mac_intr_t *intr = &infop->mri_intr;

        VERIFY3S(rtype, ==, MAC_RING_TYPE_RX);
        VERIFY3S(ring_index, <, dev->ena_num_rxqs);

        ring = &dev->ena_rxqs[ring_index];

        /* Cross-link: the queue remembers mac's handle, mac gets the queue. */
        ring->er_mrh = rh;
        infop->mri_driver = (mac_ring_driver_t)ring;

        /* Per-ring entry points. */
        infop->mri_start = ena_ring_rx_start;
        infop->mri_stop = ena_ring_rx_stop;
        infop->mri_poll = ena_ring_rx_poll;
        infop->mri_stat = ena_ring_rx_stat;

        /* Interrupt control for switching between interrupts and polling. */
        intr->mi_handle = (mac_intr_handle_t)ring;
        intr->mi_enable = ena_ring_rx_intr_enable;
        intr->mi_disable = ena_ring_rx_intr_disable;
        intr->mi_ddi_handle = dev->ena_intr_handles[ring->er_intr_vector];
}

/*
 * mc_start callback: flag the device as started, then enable the
 * watchdog. Always returns 0.
 */
static int
ena_m_start(void *arg)
{
        ena_t *dev = arg;

        /* Set the STARTED bit atomically; other state bits are untouched. */
        atomic_or_32(&dev->ena_state, ENA_STATE_STARTED);

        ena_enable_watchdog(dev);

        return (0);
}

/*
 * mc_stop callback: the reverse of ena_m_start(). The watchdog is
 * disabled first, and only then is the STARTED bit cleared.
 */
static void
ena_m_stop(void *arg)
{
        ena_t *dev = arg;

        ena_disable_watchdog(dev);

        /* Clear the STARTED bit atomically; other state bits are untouched. */
        atomic_and_32(&dev->ena_state, ~ENA_STATE_STARTED);
}

/*
 * mc_setpromisc callback.
 *
 * As discussed in ena_group_add_mac(), ENA only supports a single MAC
 * address, and therefore we prevent VNICs from being created. That
 * means there is no chance for promisc to be used as a means for
 * implementing VNIC support on ENA, as we never allow them to be
 * created in the first place.
 *
 * As for promisc itself, returning success is about the best we can
 * do. There is no promisc API for an ENA device -- you get only the
 * exact traffic AWS wants you to see.
 *
 * Both arguments are ignored and the return value is always 0.
 */
static int
ena_m_setpromisc(void *arg, boolean_t on)
{
        return (0);
}

/*
 * mc_multicst callback.
 *
 * Similarly to promisc, there is no multicast API for an ENA
 * device, so requests to add or remove a multicast address are
 * accepted without doing anything. All arguments are ignored and the
 * return value is always 0.
 */
static int
ena_m_multicast(void *arg, boolean_t add, const uint8_t *multicast_address)
{
        return (0);
}

/*
 * mc_getcapab callback.
 *
 * MAC_CAPAB_RINGS is the only capability reported; everything else
 * (including MAC_CAPAB_HCKSUM and MAC_CAPAB_LSO) returns B_FALSE.
 *
 * For rings, cap_data is a mac_capab_rings_t whose mr_type selects
 * which side (Tx or Rx) is being described. Both sides expose
 * ena_num_intrs - 1 rings in statically assigned groups.
 */
static boolean_t
ena_m_getcapab(void *arg, mac_capab_t capab, void *cap_data)
{
        ena_t *dev = arg;
        mac_capab_rings_t *rings;

        if (capab != MAC_CAPAB_RINGS) {
                return (B_FALSE);
        }

        rings = cap_data;
        rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
        rings->mr_gaddring = NULL;
        rings->mr_gremring = NULL;
        ASSERT3U(dev->ena_num_intrs, >=, 2);

        if (rings->mr_type == MAC_RING_TYPE_TX) {
                /*
                 * We use pseudo Tx groups for now.
                 */
                rings->mr_gnum = 0;
                rings->mr_rnum = dev->ena_num_intrs - 1;
                rings->mr_rget = ena_fill_tx_ring;
        } else if (rings->mr_type == MAC_RING_TYPE_RX) {
                /*
                 * The ENA device provides no means to add mac
                 * filters or set promisc mode; it's only meant to
                 * receive its pre-designated unicast address.
                 * However, we still want rings as the device does
                 * provide multiple queues and RSS, hence the single
                 * Rx group.
                 */
                rings->mr_gnum = 1;
                rings->mr_gget = ena_fill_rx_group;
                rings->mr_rnum = dev->ena_num_intrs - 1;
                rings->mr_rget = ena_fill_rx_ring;
        }

        return (B_TRUE);
}

/*
 * mc_setprop callback.
 *
 * No property is settable through this driver: every request,
 * regardless of its arguments, is answered with ENOTSUP.
 */
static int
ena_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, const void *pr_val)
{
        return (ENOTSUP);
}

/*
 * Store a single-byte property value into the caller's buffer.
 *
 * Returns EOVERFLOW if the buffer is too small to hold a uint8_t,
 * otherwise writes val to pr_val and returns 0.
 */
static int
ena_getprop_u8(void *pr_val, uint_t pr_valsize, uint8_t val)
{
        if (pr_valsize < sizeof (uint8_t)) {
                return (EOVERFLOW);
        }

        *(uint8_t *)pr_val = val;
        return (0);
}

/*
 * mc_getprop callback: report the current value of a link property.
 *
 * The value is copied into pr_val, which must be at least as large as
 * the property's type (pr_valsize is checked for each property).
 * ena_lock is held while the ena_t link fields are read.
 *
 * Returns 0 on success, EOVERFLOW if pr_valsize is too small for the
 * requested property, or ENOTSUP for properties not handled here.
 */
static int
ena_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, void *pr_val)
{
        ena_t *ena = arg;
        int ret = 0;
        uint64_t speed;

        mutex_enter(&ena->ena_lock);

        switch (pr_num) {
        case MAC_PROP_DUPLEX:
                if (pr_valsize < sizeof (link_duplex_t)) {
                        ret = EOVERFLOW;
                        break;
                }

                bcopy(&ena->ena_link_duplex, pr_val, sizeof (link_duplex_t));
                break;

        case MAC_PROP_SPEED:
                if (pr_valsize < sizeof (uint64_t)) {
                        ret = EOVERFLOW;
                        break;
                }

                /* The driver tracks Mbit/s; scale to bit/s in 64 bits. */
                speed = ena->ena_link_speed_mbits * 1000000ULL;
                bcopy(&speed, pr_val, sizeof (speed));
                break;

        case MAC_PROP_STATUS:
                if (pr_valsize < sizeof (link_state_t)) {
                        ret = EOVERFLOW;
                        break;
                }

                bcopy(&ena->ena_link_state, pr_val, sizeof (link_state_t));
                break;

        case MAC_PROP_AUTONEG:
                /*
                 * Non-zero means auto-negotiation is enabled, so a
                 * true ena_link_autoneg must be reported as 1.
                 */
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    ena->ena_link_autoneg ? 1 : 0);
                break;

        case MAC_PROP_MTU:
                if (pr_valsize < sizeof (uint32_t)) {
                        ret = EOVERFLOW;
                        break;
                }

                bcopy(&ena->ena_mtu, pr_val, sizeof (uint32_t));
                break;

        /*
         * The advertised/enabled speed capabilities are each reported
         * as 1 when the matching bit is set in ena_link_speeds and 0
         * otherwise.
         */
        case MAC_PROP_ADV_1000FDX_CAP:
        case MAC_PROP_EN_1000FDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_1G) != 0);
                break;

        case MAC_PROP_ADV_2500FDX_CAP:
        case MAC_PROP_EN_2500FDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_2_HALF_G) != 0);
                break;

        case MAC_PROP_ADV_5000FDX_CAP:
        case MAC_PROP_EN_5000FDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_5G) != 0);
                break;

        case MAC_PROP_ADV_10GFDX_CAP:
        case MAC_PROP_EN_10GFDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_10G) != 0);
                break;

        case MAC_PROP_ADV_25GFDX_CAP:
        case MAC_PROP_EN_25GFDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_25G) != 0);
                break;

        case MAC_PROP_ADV_40GFDX_CAP:
        case MAC_PROP_EN_40GFDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_40G) != 0);
                break;

        case MAC_PROP_ADV_100GFDX_CAP:
        case MAC_PROP_EN_100GFDX_CAP:
                ret = ena_getprop_u8(pr_val, pr_valsize,
                    (ena->ena_link_speeds & ENAHW_LINK_SPEED_100G) != 0);
                break;

        default:
                ret = ENOTSUP;
                break;
        }

        mutex_exit(&ena->ena_lock);
        return (ret);
}

/*
 * mc_propinfo callback.
 *
 * Currently a no-op: nothing is reported through prh for any
 * property, and all arguments are ignored.
 */
static void
ena_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    mac_prop_info_handle_t prh)
{
}

/*
 * The mac callback table for this driver; ena_mac_register() hands it
 * to mac as m_callbacks. mc_callbacks carries the MC_* flags matching
 * the getcapab and property entry points filled in below.
 */
static mac_callbacks_t ena_m_callbacks = {
        .mc_callbacks =
            MC_GETCAPAB |
            MC_SETPROP |
            MC_GETPROP |
            MC_PROPINFO,

        /* Device lifecycle and statistics. */
        .mc_start = ena_m_start,
        .mc_stop = ena_m_stop,
        .mc_getstat = ena_m_stat,

        /* Receive filtering (both are accepted but do nothing). */
        .mc_setpromisc = ena_m_setpromisc,
        .mc_multicst = ena_m_multicast,

        /* Capabilities and properties. */
        .mc_getcapab = ena_m_getcapab,
        .mc_getprop = ena_m_getprop,
        .mc_setprop = ena_m_setprop,
        .mc_propinfo = ena_m_propinfo,
};

/*
 * Unregister the device from mac.
 *
 * If no mac handle is present (ena_mh is NULL) there is nothing to
 * undo and 0 is returned. Otherwise the result of mac_unregister()
 * is passed straight back to the caller.
 */
int
ena_mac_unregister(ena_t *ena)
{
        if (ena->ena_mh != NULL) {
                return (mac_unregister(ena->ena_mh));
        }

        return (0);
}

/*
 * Register the device with mac as an Ethernet device.
 *
 * Allocates a mac_register_t, describes the device (callbacks, MAC
 * address, SDU range, VLAN margin, virtualization level), and calls
 * mac_register(), which stores the resulting handle in ena->ena_mh.
 * The registration structure is freed regardless of the outcome.
 *
 * Returns true on success; false if allocation or registration
 * failed (an error is logged in either case).
 */
bool
ena_mac_register(ena_t *ena)
{
        mac_register_t *reg;
        int err;

        reg = mac_alloc(MAC_VERSION);
        if (reg == NULL) {
                ena_err(ena, "failed to allocate MAC handle");
                return (false);
        }

        /* Identity of the device and the driver state tied to it. */
        reg->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
        reg->m_driver = ena;
        reg->m_dip = ena->ena_dip;
        reg->m_instance = 0;
        reg->m_callbacks = &ena_m_callbacks;
        reg->m_v12n = MAC_VIRT_LEVEL1;

        /* Addressing. */
        reg->m_src_addr = ena->ena_mac_addr;
        reg->m_dst_addr = NULL;

        /* Frame size limits; leave room for a VLAN tag. */
        reg->m_min_sdu = 0;
        reg->m_max_sdu = ena->ena_mtu;
        reg->m_margin = VLAN_TAGSZ;

        /* No plugin data or private properties. */
        reg->m_pdata = NULL;
        reg->m_pdata_size = 0;
        reg->m_priv_props = NULL;

        err = mac_register(reg, &ena->ena_mh);
        if (err != 0) {
                ena_err(ena, "failed to register ena with mac: %d", err);
        }

        mac_free(reg);

        if (err != 0) {
                return (false);
        }

        /*
         * Until we get the first AENQ link change event, we do not
         * actually know the status of the link.
         */
        mac_link_update(ena->ena_mh, LINK_STATE_UNKNOWN);

        return (true);
}