root/usr/src/uts/common/sys/mac_flow_impl.h
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2026 Oxide Computer Company
 */

#ifndef _MAC_FLOW_IMPL_H
#define _MAC_FLOW_IMPL_H

#ifdef  __cplusplus
extern "C" {
#endif

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/ksynch.h>
#include <sys/mac_flow.h>
#include <sys/stream.h>
#include <sys/sdt.h>
#include <net/if.h>

/*
 * Reference counting on a flow_entry_t.
 *
 * FLOW_REFHOLD() takes an unconditional hold: it fires the flow_refhold
 * DTrace probe and then bumps fe_refcnt with fe_lock held. The argument is
 * expanded several times, so it must be free of side effects.
 */
#define FLOW_REFHOLD(flent) {                                   \
        DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));   \
        mutex_enter(&((flent)->fe_lock));                       \
        (flent)->fe_refcnt += 1;                                \
        mutex_exit(&((flent)->fe_lock));                        \
}

/*
 * Conditional hold, intended for data paths.
 *
 * The hold is refused -- err is set to -1 and fe_refcnt is left untouched --
 * when any of FE_INCIPIENT, FE_QUIESCE, FE_CONDEMNED, FE_UF_NO_DATAPATH or
 * FE_MC_NO_DATAPATH is set in fe_flags. Otherwise fe_refcnt is incremented
 * and err is 0. The flag test and the increment both happen under fe_lock.
 *
 * An INCIPIENT flow is not completely set up, so a data path could stumble
 * on inconsistent data structures. A QUIESCE flow has a control operation
 * waiting for quiescence so that it can change callbacks or other structures
 * without the use of locks.
 *
 * The flow_refhold probe fires whether or not the hold is granted.
 */
#define FLOW_TRY_REFHOLD(flent, err) {                          \
        DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));   \
        (err) = 0;                                              \
        mutex_enter(&(flent)->fe_lock);                         \
        if (((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE |   \
            FE_CONDEMNED | FE_UF_NO_DATAPATH |                  \
            FE_MC_NO_DATAPATH)) == 0) {                         \
                (flent)->fe_refcnt++;                           \
        } else {                                                \
                (err) = -1;                                     \
        }                                                       \
        mutex_exit(&(flent)->fe_lock);                          \
}

/*
 * Drop one hold on fe_refcnt. After firing the flow_refrele probe the macro
 * takes fe_lock, decrements fe_refcnt, and then follows one of three paths:
 *
 *  - FE_WAITER is set: signal fe_cv and drop fe_lock. The inner ASSERT
 *    records the expectation that the count is still non-zero here.
 *  - the count reached zero: call mac_flow_destroy(). Note that fe_lock is
 *    still held at that point and this macro does not release it on this
 *    path (presumably mac_flow_destroy() disposes of the lock together with
 *    the flow entry -- confirm in mac_flow.c).
 *  - otherwise: just drop fe_lock.
 *
 * The argument is expanded several times and must have no side effects.
 */
#define FLOW_REFRELE(flent) {                                   \
        DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent));   \
        mutex_enter(&(flent)->fe_lock);                         \
        ASSERT((flent)->fe_refcnt != 0);                        \
        (flent)->fe_refcnt--;                                   \
        if ((flent)->fe_flags & FE_WAITER) {                    \
                ASSERT((flent)->fe_refcnt != 0);                \
                cv_signal(&(flent)->fe_cv);                     \
                mutex_exit(&(flent)->fe_lock);                  \
        } else if ((flent)->fe_refcnt == 0) {                   \
                mac_flow_destroy(flent);                        \
        } else {                                                \
                mutex_exit(&(flent)->fe_lock);                  \
        }                                                       \
}

/*
 * Take a hold on fe_user_refcnt (the count that is separate from
 * fe_refcnt). The increment is done with fe_lock held; no flag is checked
 * and no probe is fired.
 */
#define FLOW_USER_REFHOLD(flent) {                      \
        mutex_enter(&((flent)->fe_lock));               \
        (flent)->fe_user_refcnt += 1;                   \
        mutex_exit(&((flent)->fe_lock));                \
}

/*
 * Drop a hold on fe_user_refcnt. When the count falls to zero and FE_WAITER
 * is set in fe_flags, fe_cv is signalled. Everything happens under fe_lock,
 * which is always released before the macro finishes; unlike FLOW_REFRELE()
 * this never destroys the flow entry.
 */
#define FLOW_USER_REFRELE(flent) {                      \
        mutex_enter(&(flent)->fe_lock);                 \
        ASSERT((flent)->fe_user_refcnt != 0);           \
        (flent)->fe_user_refcnt--;                      \
        if ((flent)->fe_user_refcnt == 0) {             \
                if ((flent)->fe_flags & FE_WAITER)      \
                        cv_signal(&(flent)->fe_cv);     \
        }                                               \
        mutex_exit(&(flent)->fe_lock);                  \
}

/*
 * Drop what the caller believes is the last hold on the flow entry. The
 * ASSERT checks that exactly one fe_refcnt hold and no fe_user_refcnt holds
 * remain (it reads the counters without taking fe_lock), then the hold is
 * released through FLOW_REFRELE().
 *
 * The argument is parenthesized wherever it is dereferenced so that
 * expressions such as `&entry' or `tab[i]' expand correctly.
 */
#define FLOW_FINAL_REFRELE(flent) {                     \
        ASSERT((flent)->fe_refcnt == 1 &&               \
            (flent)->fe_user_refcnt == 0);              \
        FLOW_REFRELE(flent);                            \
}

/*
 * Mark or unmark the flent with a bit flag
 */
#define FLOW_MARK(flent, flag) {                \
        mutex_enter(&(flent)->fe_lock);         \
        (flent)->fe_flags |= flag;              \
        mutex_exit(&(flent)->fe_lock);          \
}

/*
 * Clear the given bit flag(s) in the flent's fe_flags, under fe_lock.
 *
 * `flag' must be parenthesized before it is complemented: without the
 * parentheses FLOW_UNMARK(f, A | B) would expand to `&= ~A | B' and clear
 * only A (while leaving every bit of B alone).
 */
#define FLOW_UNMARK(flent, flag) {              \
        mutex_enter(&(flent)->fe_lock);         \
        (flent)->fe_flags &= ~(flag);           \
        mutex_exit(&(flent)->fe_lock);          \
}

/*
 * Obtain the "mip" associated with a flow entry: when fe_mbg is non-NULL
 * the result is mac_bcast_grp_mip(fe_mbg), otherwise it is the mci_mip
 * member of the mac_client_impl_t that fe_mcip points to.
 *
 * `flent' is parenthesized at each use so that arguments such as `&entry'
 * expand correctly; it is evaluated more than once.
 */
#define FLENT_TO_MIP(flent)                                             \
        ((flent)->fe_mbg != NULL ? mac_bcast_grp_mip((flent)->fe_mbg) : \
        ((mac_client_impl_t *)(flent)->fe_mcip)->mci_mip)

/*
 * Convert a bandwidth expressed in bps to a number of bytes per tick.
 * The shift by 3 divides the bit rate by 8 to get bytes; the result is then
 * divided by `hz', which is not defined in this header (presumably the
 * kernel's ticks-per-second variable -- confirm). Both steps truncate, so
 * small rates can yield 0.
 */
#define FLOW_BYTES_PER_TICK(bps)        (((bps) >> 3) / hz)

/*
 * Lower bound of the sub-range for priority level `pri'.
 *
 * The underlying range [min, max] is cut into MRP_PRIORITY_LEVELS slices of
 * (max - min) / MRP_PRIORITY_LEVELS each (integer division); the result is
 * `min' advanced by `pri' such slices.
 */
#define FLOW_MIN_PRIORITY(min, max, pri)        \
        (((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)) + (min))

/*
 * Upper bound of a sub-range whose lower bound is `base': one slice of
 * (max - min) / MRP_PRIORITY_LEVELS (integer division) above `base'.
 */
#define FLOW_MAX_PRIORITY(min, max, base)       \
        ((((max) - (min)) / MRP_PRIORITY_LEVELS) + (base))

/*
 * Given an underlying range and a priority level, get the absolute
 * priority value. For now there are just 3 values, high, low and
 * medium, so we can just return max, min or min + (max - min) / 2.
 * If there are more than three we need to change this computation.
 *
 * The whole conditional is wrapped in parentheses so that the macro can be
 * used as an operand (e.g. `FLOW_PRIORITY(a, b, p) + 1'); without them the
 * surrounding operator would bind to the last arm only.
 */
#define FLOW_PRIORITY(min, max, pri)            \
        ((pri) == MPL_HIGH ? (max) :            \
        (pri) == MPL_LOW ? (min) :              \
        ((min) + (((max) - (min)) / 2)))

/*
 * Not referenced elsewhere in this header. Presumably a default flow table
 * size (cf. ft_size / fti_size below) -- TODO confirm against mac_flow.c.
 */
#define MAC_FLOW_TAB_SIZE               500

/*
 * Short type names for the three flow structures defined later in this
 * header; declared up front so the function-pointer typedefs and structures
 * below can refer to one another.
 */
typedef struct flow_entry_s             flow_entry_t;
typedef struct flow_tab_s               flow_tab_t;
typedef struct flow_state_s             flow_state_t;
/*
 * Incomplete types defined outside this header. Only pointers to them are
 * used here, so their full definitions are not needed.
 */
struct mac_impl_s;
struct mac_client_impl_s;
struct mac_soft_ring_set_s;
struct mac_group_s;
struct mac_bcast_grp_s;

/*
 * Classification flags used to look up the flow. Each flag is a distinct
 * bit so that they can be OR-ed together.
 */
#define FLOW_INBOUND            (1 << 0)
#define FLOW_OUTBOUND           (1 << 1)
/* Don't compare VID when classifying the packets, see mac_rx_classify() */
#define FLOW_IGNORE_VLAN        (1 << 2)

/*
 * Generic flow client function signature. This is the type of the fe_cb_fn
 * callback stored in a flow_entry_t. The two leading void * parameters are
 * presumably fe_cb_arg1 and fe_cb_arg2, and the mblk_t * the packet(s) being
 * delivered; the meaning of the boolean_t is not visible in this header --
 * confirm against the call sites.
 */
typedef void            (*flow_fn_t)(void *, void *, mblk_t *, boolean_t);

/*
 * Flow state selector; this is the type of the second argument of
 * mac_flow_wait(). The enumerators carry their implicit values explicitly.
 */
typedef enum {
        FLOW_DRIVER_UPCALL      = 0,
        FLOW_USER_REF           = 1
} mac_flow_state_t;

/*
 * Matches a flow_entry_t using the extracted flow_state_t info.
 * This is the type of flow_entry_t's fe_match member. The boolean_t result
 * presumably reports whether the entry matches -- confirm against the
 * implementations.
 */
typedef boolean_t       (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
                            flow_state_t *);

/*
 * Bit flags kept in a flow entry's fe_flags. Each enumerator occupies its
 * own bit so the flags can be combined.
 */
typedef enum {
        FE_QUIESCE              = 1 << 0,  /* Quiesce the flow */
        FE_WAITER               = 1 << 1,  /* Flow has a waiter */
        FE_FLOW_TAB             = 1 << 2,  /* Flow is in the flow tab list */
        FE_G_FLOW_HASH          = 1 << 3,  /* Flow is in the global flow hash */
        FE_INCIPIENT            = 1 << 4,  /* Being setup */
        FE_CONDEMNED            = 1 << 5,  /* Being deleted */
        FE_UF_NO_DATAPATH       = 1 << 6,  /* No datapath setup for User flow */
        FE_MC_NO_DATAPATH       = 1 << 7,  /* No datapath setup for mac client */
} flow_entry_flags_t;

/*
 * Flow type bits, stored in a flow entry's fe_type.
 */
typedef enum {
        FLOW_PRIMARY_MAC        = 1 << 0,  /* NIC primary MAC address */
        FLOW_VNIC_MAC           = 1 << 1,  /* VNIC flow */
        FLOW_MCAST              = 1 << 2,  /* Multicast (and broadcast) */
        FLOW_OTHER              = 1 << 3,  /* Other flows configured */
        FLOW_USER               = 1 << 4,  /* User defined flow */
        FLOW_NO_STATS           = 1 << 5,  /* Don't create stats for the flow */
} flow_entry_type_t;

/* Alternate spelling of FLOW_VNIC_MAC. */
#define FLOW_VNIC               FLOW_VNIC_MAC

/*
 * Shared Bandwidth control counters between the soft ring set and its
 * associated soft rings. In case the flow associated with NIC/VNIC
 * has a group of Rx rings assigned to it, we have the same
 * number of soft ring sets as we have the Rx ring in the group
 * and each individual SRS (and its soft rings) decide when to
 * poll their Rx ring independently. But if there is a B/W limit
 * associated with the NIC/VNIC, then the B/W control counter is
 * shared across all the SRS in the group and their associated
 * soft rings.
 *
 * There is a many to 1 mapping between the SRS and
 * mac_bw_ctl if the flow has a group of Rx rings associated with
 * it.
 *
 * A flow_entry_t embeds two of these, one per direction (fe_tx_bw and
 * fe_rx_bw). Fields without a comment below are not described anywhere in
 * this header; the notes marked "presumably" are guesses from the names.
 */
typedef struct mac_bw_ctl_s {
        /* presumably protects the remaining members -- TODO confirm */
        kmutex_t        mac_bw_lock;
        /* state bits; their definitions are not in this header */
        uint32_t        mac_bw_state;
        size_t          mac_bw_sz;      /* ?? Is it needed */
        size_t          mac_bw_limit;   /* Max bytes to process per tick */
        size_t          mac_bw_used;    /* Bytes processed in current tick */
        size_t          mac_bw_drop_threshold; /* Max queue length */
        /* presumably a running count of dropped bytes -- TODO confirm */
        size_t          mac_bw_drop_bytes;
        /* presumably bytes obtained by polling vs. by interrupt -- confirm */
        size_t          mac_bw_polled;
        size_t          mac_bw_intr;
        /* presumably the tick that mac_bw_used refers to -- TODO confirm */
        clock_t         mac_bw_curr_time;
} mac_bw_ctl_t;

/*
 * A flow entry: the flow's description and resource properties, its
 * reference counts and state flags, the callback used to deliver matching
 * packets, and pointers to the rings / soft ring sets / client it belongs to.
 *
 * The right-hand column records what protects each member, as noted by the
 * original authors:
 *   ft_lock    the ft_lock of a flow table (struct flow_tab_s)
 *   fe_lock    this entry's own mutex
 *   WO         presumably "write once", i.e. set at creation -- TODO confirm
 *   SL         not explained in this header (presumably the MAC perimeter /
 *              serializer) -- TODO confirm
 * Members with no annotation have no protection documented here.
 */
struct flow_entry_s {                                   /* Protected by */
        /* Next entry; presumably the flow table hash chain link */
        flow_entry_t            *fe_next;               /* ft_lock */

        datalink_id_t           fe_link_id;             /* WO */

        /* Properties as specified for this flow */
        mac_resource_props_t    fe_resource_props;      /* SL */

        /* Properties actually effective at run time for this flow */
        mac_resource_props_t    fe_effective_props;     /* SL */

        kmutex_t                fe_lock;
        char                    fe_flow_name[MAXFLOWNAMELEN];   /* fe_lock */
        flow_desc_t             fe_flow_desc;           /* fe_lock */
        kcondvar_t              fe_cv;                  /* fe_lock */
        /*
         * Initial flow ref is 1 on creation. A thread that looks up the
         * flent, typically by a mac_flow_lookup(), dynamically holds a ref.
         * If the ref is 1, it means there aren't any upcalls from the driver
         * or downcalls from the stack using this flent. Structures pointing
         * to the flent or flent inserted in lists don't count towards this
         * refcnt. Instead they are tracked using fe_flags. Only a control
         * thread doing a teardown operation deletes the flent, after waiting
         * for upcalls to finish synchronously. The fe_refcnt tracks
         * the number of upcall refs.
         */
        uint32_t                fe_refcnt;              /* fe_lock */

        /*
         * This tracks lookups done using the global hash list for user
         * generated flows. This refcnt only protects the flent itself
         * from disappearing and helps walkers to read the flent info such
         * as flow spec. However the flent may be quiesced and the SRS could
         * be deleted. The fe_user_refcnt tracks the number of global flow
         * hash refs.
         */
        uint32_t                fe_user_refcnt;         /* fe_lock */
        /* Bitwise OR of flow_entry_flags_t (FE_*) values */
        flow_entry_flags_t      fe_flags;               /* fe_lock */

        /*
         * Function/args to invoke for delivering matching packets.
         * Only the function fe_cb_fn may be changed dynamically and
         * atomically. The fe_cb_arg1 and fe_cb_arg2 are set at creation
         * time and may not be changed.
         */
        flow_fn_t               fe_cb_fn;               /* fe_lock */
        void                    *fe_cb_arg1;            /* fe_lock */
        void                    *fe_cb_arg2;            /* fe_lock */

        /* Opaque value; see mac_flow_get_client_cookie() */
        void                    *fe_client_cookie;      /* WO */
        struct mac_group_s      *fe_rx_ring_group;      /* SL */

        /*
         * Rx soft ring sets attached to this flow; fe_rx_srs_cnt is
         * presumably the number of valid slots in fe_rx_srs[] -- confirm.
         */
                                                        /* fe_lock */
        struct mac_soft_ring_set_s      *fe_rx_srs[MAX_RINGS_PER_GROUP];
        uint32_t                        fe_rx_srs_cnt;          /* fe_lock */
        struct mac_group_s              *fe_tx_ring_group;
        struct mac_soft_ring_set_s      *fe_tx_srs;             /* WO */

        /*
         * This is a unicast flow, and is a mac_client_impl_t
         */
        struct mac_client_impl_s        *fe_mcip;               /* WO */

        /*
         * Used by mci_flent_list of mac_client_impl_t to track flows sharing
         * the same mac_client_impl_t.
         */
        flow_entry_t            *fe_client_next;

        /*
         * This is a broadcast or multicast flow and is a mac_bcast_grp_t.
         * FLENT_TO_MIP() tests this pointer against NULL to choose between
         * fe_mbg and fe_mcip.
         */
        struct mac_bcast_grp_s  *fe_mbg;                /* WO */
        /* Bitwise OR of flow_entry_type_t (FLOW_*) values */
        flow_entry_type_t       fe_type;                /* WO */

        /*
         * BW control info, one set of counters per direction.
         */
        mac_bw_ctl_t            fe_tx_bw;
        mac_bw_ctl_t            fe_rx_bw;

        /*
         * Used by flow table lookup code
         */
        flow_match_fn_t         fe_match;

        /*
         * Used by mac_flow_remove().
         */
        int                     fe_index;
        flow_tab_t              *fe_flow_tab;

        /* kstat handles; see also flow_stat_destroy() */
        kstat_t                 *fe_ksp;
        kstat_t                 *fe_misc_stat_ksp;

        /* Not described in this header; consult the users of these fields */
        boolean_t               fe_desc_logged;
        uint64_t                fe_nic_speed;
};

/*
 * Various structures used by the flows framework for keeping track
 * of packet state information.
 */

/*
 * Layer 2 packet state, embedded in flow_state_t as fs_l2info.
 * Going by the member names (none are described in this header):
 * l2_start presumably points at the start of the L2 header and l2_daddr at
 * the destination address within it; l2_vid is presumably the VLAN ID,
 * l2_sap the SAP/ethertype and l2_hdrsize the header length in bytes --
 * TODO confirm against the accept functions that fill this in.
 */
typedef struct flow_l2info_s {
        uchar_t         *l2_start;
        uint8_t         *l2_daddr;
        uint16_t        l2_vid;
        uint32_t        l2_sap;
        uint_t          l2_hdrsize;
} flow_l2info_t;

/*
 * Layer 3 packet state, embedded in flow_state_t as fs_l3info.
 * Going by the member names (none are described in this header):
 * l3_start presumably points at the start of the L3 header, l3_protocol is
 * presumably the transport protocol number, l3_version the IP version and
 * l3_hdrsize the header length in bytes; l3_fragmented presumably marks a
 * fragmented packet. The sense of l3_dst_or_src (which value selects
 * destination vs. source) cannot be determined from this header -- TODO
 * confirm all of the above against the code that fills this in.
 */
typedef struct flow_l3info_s {
        uchar_t         *l3_start;
        uint8_t         l3_protocol;
        uint8_t         l3_version;
        boolean_t       l3_dst_or_src;
        uint_t          l3_hdrsize;
        boolean_t       l3_fragmented;
} flow_l3info_t;

/*
 * Layer 4 packet state, embedded in flow_state_t as fs_l4info.
 * Going by the member names (none are described in this header):
 * l4_start presumably points at the start of the transport header and
 * l4_src_port / l4_dst_port hold its ports; l4_hash_port is presumably the
 * port chosen for hashing. Byte order is not stated here -- TODO confirm.
 */
typedef struct flow_l4info_s {
        uchar_t         *l4_start;
        uint16_t        l4_src_port;
        uint16_t        l4_dst_port;
        uint16_t        l4_hash_port;
} flow_l4info_t;

/*
 * Combined state structure.
 * Holds flow direction and an mblk_t pointer, plus the per-layer state
 * accumulated for that packet. This is the structure handed to the
 * fo_hash() and fo_accept[] operations and to flow_match_fn_t callbacks.
 */
struct flow_state_s {
        /* presumably the FLOW_INBOUND/FLOW_OUTBOUND/... flags -- confirm */
        uint_t          fs_flags;
        mblk_t          *fs_mp;
        flow_l2info_t   fs_l2info;
        flow_l3info_t   fs_l3info;
        flow_l4info_t   fs_l4info;
};

/*
 * Flow ops vector.
 * There are two groups of functions. The ones ending with _fe are
 * called when a flow is being added. The others (hash, accept) are
 * called at flow lookup time.
 */
/* Capacity of the fo_accept[] array in flow_ops_t. */
#define FLOW_MAX_ACCEPT 16
typedef struct flow_ops_s {
        /*
         * fo_accept_fe():
         * Validates the contents of the flow and checks whether
         * it's compatible with the flow table. Sets the fe_match
         * function of the flow.
         */
        int             (*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
        /*
         * fo_hash_fe():
         * Generates a hash index to the flow table. This function
         * must use the same algorithm as fo_hash(), which is used
         * by the flow lookup code path.
         */
        uint32_t        (*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
        /*
         * fo_match_fe():
         * This is used for finding identical flows.
         */
        boolean_t       (*fo_match_fe)(flow_tab_t *, flow_entry_t *,
                            flow_entry_t *);
        /*
         * fo_insert_fe():
         * Used for inserting a flow to a flow chain.
         * Protocols that have special ordering requirements would
         * need to implement this. For those that don't,
         * flow_generic_insert_fe() may be used.
         */
        int             (*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
                            flow_entry_t *);

        /*
         * fo_hash():
         * Calculates the flow hash index based on the accumulated
         * state in flow_state_t. Must use the same algorithm as
         * fo_hash_fe().
         */
        uint32_t        (*fo_hash)(flow_tab_t *, flow_state_t *);

        /*
         * fo_accept[]:
         * Array of accept functions.
         * Each function in the array will accumulate enough state
         * (header length, protocol) to allow the next function to
         * proceed. We support up to FLOW_MAX_ACCEPT functions which
         * should be sufficient for all practical purposes.
         */
        int             (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
                            flow_state_t *);
} flow_ops_t;

/*
 * Generic flow table: a hash table of flow_entry_t chains together with
 * the ops vector that knows how to hash, match and insert entries for this
 * table type.
 */
struct flow_tab_s {
        /*
         * Reader/writer lock; flow_entry_t's fe_next is documented as
         * protected by it.
         */
        krwlock_t               ft_lock;
        /*
         * Contains a list of functions (described above)
         * specific to this table type.
         */
        flow_ops_t              ft_ops;

        /*
         * Indicates what types of flows are supported.
         */
        flow_mask_t             ft_mask;

        /*
         * An array of flow_entry_t * of size ft_size.
         * Each element is the beginning of a hash chain.
         */
        flow_entry_t            **ft_table;
        uint_t                  ft_size;

        /*
         * The number of flows inserted into ft_table.
         * FLOW_TAB_EMPTY() tests this against zero.
         */
        uint_t                  ft_flow_count;
        /*
         * Owner back-pointers. Not described in this header; presumably the
         * MAC instance and MAC client the table belongs to -- TODO confirm.
         */
        struct mac_impl_s       *ft_mip;
        struct mac_client_impl_s        *ft_mcip;
};

/*
 * This is used for describing what type of flow table can be created.
 * mac_flow.c contains a list of these structures.
 * The three members have the same types as the first three arguments of
 * mac_flow_tab_create() (ops vector, flow mask, table size).
 */
typedef struct flow_tab_info_s {
        flow_ops_t              *fti_ops;
        flow_mask_t             fti_mask;
        uint_t                  fti_size;
} flow_tab_info_t;

/*
 * Non-zero when `ft' is a NULL table pointer or a table whose
 * ft_flow_count is zero. `ft' is evaluated up to twice.
 */
#define FLOW_TAB_EMPTY(ft)      (!(ft) || !(ft)->ft_flow_count)


/*
 * Add `c' (converted to uint64_t) to the mms_<s> counter in the
 * mci_misc_stat of the mac_client_impl_t that `m' is cast to. `s' is the
 * counter name without its mms_ prefix. The update is a plain +=; nothing
 * in the macro makes it atomic or takes a lock.
 */
#define MCIP_STAT_UPDATE(m, s, c) {                                     \
        ((mac_client_impl_t *)(m))->mci_misc_stat.mms_##s +=            \
            (uint64_t)(c);                                              \
}

/*
 * Add `c' (converted to uint64_t) to the Rx counter mrs_<s> held in
 * srs_rx.sr_stat of the mac_soft_ring_set_t that `m' is cast to. `s' is the
 * counter name without its mrs_ prefix. The update is a plain +=.
 */
#define SRS_RX_STAT_UPDATE(m, s, c)  {                                  \
        ((mac_soft_ring_set_t *)(m))->srs_rx.sr_stat.mrs_##s +=         \
            (uint64_t)(c);                                              \
}

/*
 * Add `c' (converted to uint64_t) to the Tx counter mts_<s> held in
 * srs_tx.st_stat of the mac_soft_ring_set_t that `m' is cast to. `s' is the
 * counter name without its mts_ prefix. The update is a plain +=.
 */
#define SRS_TX_STAT_UPDATE(m, s, c)  {                                  \
        ((mac_soft_ring_set_t *)(m))->srs_tx.st_stat.mts_##s +=         \
            (uint64_t)(c);                                              \
}

/*
 * Fold a set of Tx counters into a soft ring set: adds s->mts_opackets,
 * s->mts_obytes and s->mts_oerrors to the counters of the same name in `m'
 * by way of SRS_TX_STAT_UPDATE(). Both arguments are expanded three times,
 * so they must be free of side effects.
 */
#define SRS_TX_STATS_UPDATE(m, s) {                                     \
        SRS_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets);           \
        SRS_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes);               \
        SRS_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors);             \
}

/*
 * Add `c' (converted to uint64_t) to the Tx counter mts_<s> held in
 * s_st_stat of the mac_soft_ring_t that `m' is cast to. `s' is the counter
 * name without its mts_ prefix. The update is a plain +=.
 */
#define SOFTRING_TX_STAT_UPDATE(m, s, c)  {                             \
        ((mac_soft_ring_t *)(m))->s_st_stat.mts_##s +=                  \
            (uint64_t)(c);                                              \
}

/*
 * Fold a set of Tx counters into a soft ring: adds s->mts_opackets,
 * s->mts_obytes and s->mts_oerrors to the counters of the same name in `m'
 * by way of SOFTRING_TX_STAT_UPDATE(). Both arguments are expanded three
 * times, so they must be free of side effects.
 */
#define SOFTRING_TX_STATS_UPDATE(m, s) {                                \
        SOFTRING_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets);      \
        SOFTRING_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes);          \
        SOFTRING_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors);        \
}

/*
 * Flow framework init/fini entry points (defined outside this header).
 * They are declared with an explicit (void) parameter list: in C an empty
 * `()' is not a prototype and would let callers pass arbitrary arguments
 * unchecked. This stays compatible with a definition written with `()'.
 */
extern void     mac_flow_init(void);
extern void     mac_flow_fini(void);
/*
 * Prototypes for the flow framework. The functions are defined outside this
 * header, so the grouping notes below are based on the names and signatures
 * only; consult the definitions for the actual contracts (locking, return
 * values, ownership).
 */

/* Creating a flow entry; the new entry is returned through the last arg. */
extern int      mac_flow_create(flow_desc_t *, mac_resource_props_t *,
                    char *, void *, uint_t, flow_entry_t **);

/* Adding an entry to a flow table, a MAC client, or the global hash. */
extern int      mac_flow_add(flow_tab_t *, flow_entry_t *);
extern int      mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *,
                    boolean_t);
extern int      mac_flow_hash_add(flow_entry_t *);
/* Lookups; the entry found is returned through the last argument. */
extern int      mac_flow_lookup_byname(char *, flow_entry_t **);
extern int      mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t,
                    flow_entry_t **);

/*
 * Table walkers: invoke the callback with each entry and the caller's
 * void * argument. The _nolock variant presumably expects the caller to
 * handle ft_lock itself -- TODO confirm.
 */
extern int      mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *),
                    void *);

extern int      mac_flow_walk_nolock(flow_tab_t *,
                    int (*)(flow_entry_t *, void *), void *);

/* Property and description accessors/modifiers. */
extern void     mac_flow_modify(flow_tab_t *, flow_entry_t *,
                    mac_resource_props_t *);

extern void     *mac_flow_get_client_cookie(flow_entry_t *);

extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *);

extern void     mac_flow_get_desc(flow_entry_t *, flow_desc_t *);
extern void     mac_flow_set_desc(flow_entry_t *, flow_desc_t *);

/*
 * Removal and teardown. mac_flow_destroy() is what FLOW_REFRELE() calls when
 * fe_refcnt drops to zero.
 */
extern void     mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t);
extern void     mac_flow_hash_remove(flow_entry_t *);
extern void     mac_flow_wait(flow_entry_t *, mac_flow_state_t);
extern void     mac_flow_cleanup(flow_entry_t *);
extern void     mac_flow_destroy(flow_entry_t *);

/* Flow table creation/destruction; new tables are returned via the last arg. */
extern void     mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t,
                    struct mac_impl_s *, flow_tab_t **);
extern void     mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **);
extern void     mac_flow_tab_destroy(flow_tab_t *);
extern void     flow_stat_destroy(flow_entry_t *);

#ifdef  __cplusplus
}
#endif

#endif  /* _MAC_FLOW_IMPL_H */