root/usr/src/uts/common/io/vioif/vioif.h
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021 Joyent, Inc.
 * Copyright 2026 Oxide Computer Company
 */

/*
 * VIRTIO NETWORK DRIVER
 */

#ifndef _VIOIF_H
#define _VIOIF_H

#include "virtio.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * VIRTIO NETWORK CONFIGURATION REGISTERS
 *
 * These are offsets into the device-specific configuration space available
 * through the virtio_dev_*() family of functions.
 */
#define VIRTIO_NET_CONFIG_MAC           0x00    /* 48 R/W */
#define VIRTIO_NET_CONFIG_STATUS        0x06    /* 16 R   */
#define VIRTIO_NET_CONFIG_MAX_VQ_PAIRS  0x08    /* 16 R   */
#define VIRTIO_NET_CONFIG_MTU           0x0A    /* 16 R   */
#define VIRTIO_NET_CONFIG_SPEED         0x0C    /* 32 R   */
#define VIRTIO_NET_CONFIG_DUPLEX        0x10    /* 8  R   */

#define VIRTIO_NET_CONFIG_STATUS_LINK_UP        1
#define VIRTIO_NET_CONFIG_SPEED_UNKNOWN         UINT32_MAX
#define VIRTIO_NET_CONFIG_DUPLEX_HALF           0
#define VIRTIO_NET_CONFIG_DUPLEX_FULL           1
#define VIRTIO_NET_CONFIG_DUPLEX_UNKNOWN        0xff

/*
 * VIRTIO NETWORK VIRTQUEUES
 *
 * Note that the control queue is only present if VIRTIO_NET_F_CTRL_VQ is
 * negotiated with the device.
 */
#define VIRTIO_NET_VIRTQ_RX             0
#define VIRTIO_NET_VIRTQ_TX             1
#define VIRTIO_NET_VIRTQ_CONTROL        2

/*
 * VIRTIO NETWORK FEATURE BITS
 */

/*
 * CSUM, GUEST_CSUM:
 *      Partial checksum support.  These features signal that the device will
 *      accept packets with partial checksums (CSUM), and that the driver will
 *      accept packets with partial checksums (GUEST_CSUM).  These features
 *      combine the use of the VIRTIO_NET_HDR_F_NEEDS_CSUM flag, and the
 *      "csum_start" and "csum_offset" fields, in the virtio net header.
 */
#define VIRTIO_NET_F_CSUM               (1ULL << 0)
#define VIRTIO_NET_F_GUEST_CSUM         (1ULL << 1)

/*
 * MTU:
 *      The device offers a maximum MTU value at VIRTIO_NET_CONFIG_MTU.  If
 *      this is not negotiated, we allow the largest possible MTU that our
 *      buffer allocations support in case jumbo frames are tacitly supported
 *      by the device.  The default MTU is always 1500.
 */
#define VIRTIO_NET_F_MTU                (1ULL << 3)

/*
 * MAC:
 *      The device has an assigned primary MAC address.  If this feature bit is
 *      not set, the driver must provide a locally assigned MAC address.  See
 *      IEEE 802, "48-bit universal LAN MAC addresses" for more details on
 *      assignment.
 */
#define VIRTIO_NET_F_MAC                (1ULL << 5)

/*
 * GUEST_TSO4, GUEST_TSO6, GUEST_UFO:
 *      Inbound segmentation offload support.  These features depend on having
 *      VIRTIO_NET_F_GUEST_CSUM and signal that the driver can accept large
 *      combined TCP (v4 or v6) packets, or reassembled UDP fragments.
 */
#define VIRTIO_NET_F_GUEST_TSO4         (1ULL << 7)
#define VIRTIO_NET_F_GUEST_TSO6         (1ULL << 8)
#define VIRTIO_NET_F_GUEST_UFO          (1ULL << 10)

/*
 * GUEST_ECN:
 *      Depends on either VIRTIO_NET_F_GUEST_TSO4 or VIRTIO_NET_F_GUEST_TSO6.
 *      This feature means the driver will look for the VIRTIO_NET_HDR_GSO_ECN
 *      bit in the "gso_type" of the virtio net header.  This bit tells the
 *      driver that the Explicit Congestion Notification (ECN) bit was set in
 *      the original TCP packets.
 */
#define VIRTIO_NET_F_GUEST_ECN          (1ULL << 9)

/*
 * HOST_TSO4, HOST_TSO6, HOST_UFO:
 *      Outbound segmentation offload support.  These features depend on having
 *      VIRTIO_NET_F_CSUM and signal that the device will accept large combined
 *      TCP (v4 or v6) packets that require segmentation offload, or large
 *      combined UDP packets that require fragmentation offload.
 */
#define VIRTIO_NET_F_HOST_TSO4          (1ULL << 11)
#define VIRTIO_NET_F_HOST_TSO6          (1ULL << 12)
#define VIRTIO_NET_F_HOST_UFO           (1ULL << 14)

/*
 * HOST_ECN:
 *      Depends on either VIRTIO_NET_F_HOST_TSO4 or VIRTIO_NET_F_HOST_TSO6.
 *      This features means the device will accept packets that both require
 *      segmentation offload and have the Explicit Congestion Notification
 *      (ECN) bit set.  If this feature is not present, the device must not
 *      send large segments that require ECN to be set.
 */
#define VIRTIO_NET_F_HOST_ECN           (1ULL << 13)

/*
 * GSO:
 *      The GSO feature is, in theory, the combination of HOST_TSO4, HOST_TSO6,
 *      and HOST_ECN.  This is only useful for legacy devices; newer devices
 *      should be using the more specific bits above.
 */
#define VIRTIO_NET_F_GSO                (1ULL << 6)

/*
 * MRG_RXBUF:
 *      This feature allows the receipt of large packets without needing to
 *      allocate large buffers.  The "virtio_net_hdr" will include an extra
 *      value: the number of buffers to gang together.
 */
#define VIRTIO_NET_F_MRG_RXBUF          (1ULL << 15)

/*
 * STATUS:
 *      The VIRTIO_NET_CONFIG_STATUS configuration register is available, which
 *      allows the driver to read the link state from the device.
 */
#define VIRTIO_NET_F_STATUS             (1ULL << 16)

/*
 * CTRL_VQ, CTRL_RX, CTRL_VLAN:
 *      These features signal that the device exposes the control queue
 *      (VIRTIO_NET_VIRTQ_CONTROL), in the case of CTRL_VQ; and that the
 *      control queue supports extra commands (CTRL_RX, CTRL_VLAN).
 */
#define VIRTIO_NET_F_CTRL_VQ            (1ULL << 17)
#define VIRTIO_NET_F_CTRL_RX            (1ULL << 18)
#define VIRTIO_NET_F_CTRL_VLAN          (1ULL << 19)
#define VIRTIO_NET_F_CTRL_RX_EXTRA      (1ULL << 20)

/*
 * SPEED_DUPLEX:
 *      The VIRTIO_NET_CONFIG_SPEED and VIRTIO_NET_CONFIG_DUPLEX registers are
 *      available, which allows the driver to determine the link speed and
 *      duplex of the device.
 */
#define VIRTIO_NET_F_SPEED_DUPLEX       (1ULL << 63)

/*
 * These features are supported by the driver and we will request them from the
 * device.  Note that we do not currently request GUEST_CSUM, as the driver
 * does not presently support receiving frames with any offload features from
 * the device.
 */
#define VIRTIO_NET_WANTED_FEATURES      (VIRTIO_NET_F_CSUM |            \
                                        VIRTIO_NET_F_GSO |              \
                                        VIRTIO_NET_F_HOST_TSO4 |        \
                                        VIRTIO_NET_F_HOST_TSO6 |        \
                                        VIRTIO_NET_F_HOST_ECN |         \
                                        VIRTIO_NET_F_MAC |              \
                                        VIRTIO_NET_F_MTU |              \
                                        VIRTIO_NET_F_CTRL_VQ |          \
                                        VIRTIO_NET_F_CTRL_RX |          \
                                        VIRTIO_NET_F_STATUS)

/* The following features are only available with the modern interface. */
#define VIRTIO_NET_WANTED_FEATURES_MODERN       \
                                        (VIRTIO_NET_F_SPEED_DUPLEX)

/*
 * VIRTIO NETWORK HEADER
 *
 * This structure appears at the start of each transmit or receive packet
 * buffer. When using the legacy interface, the `vnh_num_buffers` field only
 * appears when VIRTIO_NET_F_MRG_RXBUF is negotiated, whereas it is always
 * present with the modern interface. We use a common struct since it is always
 * constructed in a buffer of at least this size, and then only put the
 * requisite number of bytes (VIRTIO_NET_HDR_LEN()) into the descriptor.
 */
struct virtio_net_hdr {
        uint8_t                         vnh_flags;
        uint8_t                         vnh_gso_type;
        uint16_t                        vnh_hdr_len;
        uint16_t                        vnh_gso_size;
        uint16_t                        vnh_csum_start;
        uint16_t                        vnh_csum_offset;
        uint16_t                        vnh_num_buffers;
} __packed;
#define VIRTIO_NET_HDR_LEN(modern) \
        ((modern) ? sizeof (struct virtio_net_hdr) : \
        offsetof(struct virtio_net_hdr, vnh_num_buffers))

/*
 * VIRTIO NETWORK HEADER: FLAGS (vnh_flags)
 */
#define VIRTIO_NET_HDR_F_NEEDS_CSUM     0x01

/*
 * VIRTIO NETWORK HEADER: OFFLOAD OPTIONS (vnh_gso_type)
 *
 * Each of these is an offload type, except for the ECN value which is
 * logically OR-ed with one of the other types.
 */
#define VIRTIO_NET_HDR_GSO_NONE         0
#define VIRTIO_NET_HDR_GSO_TCPV4        1
#define VIRTIO_NET_HDR_GSO_UDP          3
#define VIRTIO_NET_HDR_GSO_TCPV6        4
#define VIRTIO_NET_HDR_GSO_ECN          0x80

/*
 * VIRTIO CONTROL VIRTQUEUE HEADER
 *
 * This structure appears at the start of each control virtqueue request.
 */
struct virtio_net_ctrlq_hdr {
        uint8_t         vnch_class;
        uint8_t         vnch_command;
} __packed;

/*
 * Control Queue Classes
 */
#define VIRTIO_NET_CTRL_RX              0

/*
 * CTRL_RX commands
 */
#define VIRTIO_NET_CTRL_RX_PROMISC      0
#define VIRTIO_NET_CTRL_RX_ALLMULTI     1
#define VIRTIO_NET_CTRL_RX_ALLUNI       2
#define VIRTIO_NET_CTRL_RX_NOMULTI      3
#define VIRTIO_NET_CTRL_RX_NOUNI        4
#define VIRTIO_NET_CTRL_RX_NOBCAST      5

/*
 * Control queue ack values
 */
#define VIRTIO_NET_CQ_OK                0
#define VIRTIO_NET_CQ_ERR               1


/*
 * DRIVER PARAMETERS
 */

/*
 * At attach, we allocate a fixed pool of buffers for receipt and transmission
 * of frames.  The maximum number of buffers of each type that we will allocate
 * is specified here.  If the ring size is smaller than this number, we will
 * use the ring size instead.
 */
#define VIRTIO_NET_TX_BUFS              256
#define VIRTIO_NET_RX_BUFS              256

/*
 * Initially, only use a single buf for control queue requests (when
 * present). If this becomes a bottleneck, we can simply increase this
 * value as necessary.
 */
#define VIRTIO_NET_CTRL_BUFS            1

/*
 * The virtio net header and the first buffer segment share the same DMA
 * allocation.  We round up the virtio header size to a multiple of 4 and add 2
 * bytes so that the IP header, which starts immediately after the 14 or 18
 * byte Ethernet header, is then correctly aligned:
 *
 *   0                10      16   18                              32/36
 *   | virtio_net_hdr | %4==0 | +2 | Ethernet header (14/18 bytes) | IPv4 ...
 *
 * Note that for this to work correctly, the DMA allocation must also be 4 byte
 * aligned.
 */
#define VIOIF_HEADER_ALIGN              4
#define VIOIF_HEADER_SKIP               (P2ROUNDUP( \
                                            sizeof (struct virtio_net_hdr), \
                                            VIOIF_HEADER_ALIGN) + 2)

/*
 * Given we are not negotiating VIRTIO_NET_F_MRG_RXBUF, the specification says
 * we must be able to accept a 1514 byte packet, or if any segmentation offload
 * features have been negotiated a 65550 byte packet.  To keep things simple,
 * we'll assume segmentation offload is possible in most cases.  In addition to
 * the packet payload, we need to account for the Ethernet header and the
 * virtio_net_hdr.
 */
#define VIOIF_RX_DATA_SIZE              65550
#define VIOIF_RX_BUF_SIZE               (VIOIF_RX_DATA_SIZE + \
                                            sizeof (struct ether_header) + \
                                            VIOIF_HEADER_SKIP)

/*
 * If we assume that a large allocation will probably have mostly 4K page sized
 * cookies, 64 segments allows us 256KB for a single frame.  We're in control
 * of the allocation we use for receive buffers, so this value only has an
 * impact on the length of chain we're able to create for external transmit
 * buffer mappings.
 */
#define VIOIF_MAX_SEGS                  64

/*
 * We pre-allocate a reasonably large buffer to copy small packets
 * there. Bigger packets are mapped, packets with multiple
 * cookies are mapped as indirect buffers.
 */
#define VIOIF_TX_INLINE_SIZE            (2 * 1024)

/*
 * Control queue messages are very small. This is a rather arbitrary small
 * bufer size that should be sufficiently large for any control queue
 * messages we will send.
 */
#define VIOIF_CTRL_SIZE                 256

/*
 * TYPE DEFINITIONS
 */

typedef struct vioif vioif_t;

/*
 * Receive buffers are allocated in advance as a combination of DMA memory and
 * a descriptor chain.  Receive buffers can be loaned to the networking stack
 * to avoid copying, and this object contains the free routine to pass to
 * desballoc().
 *
 * When receive buffers are not in use, they are linked into the per-instance
 * free list, "vif_rxbufs" via "rb_link".  Under normal conditions, we expect
 * the free list to be empty much of the time; most buffers will be in the ring
 * or on loan.
 */
typedef struct vioif_rxbuf {
        vioif_t                         *rb_vioif;
        frtn_t                          rb_frtn;

        virtio_dma_t                    *rb_dma;
        virtio_chain_t                  *rb_chain;

        list_node_t                     rb_link;
} vioif_rxbuf_t;

typedef struct vioif_ctrlbuf {
        vioif_t                         *cb_vioif;

        virtio_dma_t                    *cb_dma;
        virtio_chain_t                  *cb_chain;

        list_node_t                     cb_link;
} vioif_ctrlbuf_t;

/*
 * Transmit buffers are also allocated in advance.  DMA memory is allocated for
 * the virtio net header, and to hold small packets.  Larger packets are mapped
 * from storage loaned to the driver by the network stack.
 *
 * When transmit buffers are not in use, they are linked into the per-instance
 * free list, "vif_txbufs" via "tb_link".
 */
typedef struct vioif_txbuf {
        mblk_t                          *tb_mp;

        /*
         * Inline buffer space (VIOIF_TX_INLINE_SIZE) for storage of the virtio
         * net header, and to hold copied (rather than mapped) packet data.
         */
        virtio_dma_t                    *tb_dma;
        virtio_chain_t                  *tb_chain;

        /*
         * External buffer mapping.  The capacity is fixed at allocation time,
         * and "tb_ndmaext" tracks the current number of mappings.
         */
        virtio_dma_t                    **tb_dmaext;
        uint_t                          tb_dmaext_capacity;
        uint_t                          tb_ndmaext;

        list_node_t                     tb_link;
} vioif_txbuf_t;

typedef enum vioif_runstate {
        VIOIF_RUNSTATE_STOPPED = 1,
        VIOIF_RUNSTATE_STOPPING,
        VIOIF_RUNSTATE_RUNNING
} vioif_runstate_t;

/*
 * Per-instance driver object.
 */
struct vioif {
        dev_info_t                      *vif_dip;
        virtio_t                        *vif_virtio;

        kmutex_t                        vif_mutex;

        /*
         * The NIC is considered RUNNING between the mc_start(9E) and
         * mc_stop(9E) calls.  Otherwise it is STOPPING (while draining
         * resources) then STOPPED.  When not RUNNING, we will drop incoming
         * frames and refuse to insert more receive buffers into the receive
         * queue.
         */
        vioif_runstate_t                vif_runstate;

        mac_handle_t                    vif_mac_handle;

        virtio_queue_t                  *vif_rx_vq;
        virtio_queue_t                  *vif_tx_vq;
        virtio_queue_t                  *vif_ctrl_vq;

        /* TX virtqueue management resources */
        boolean_t                       vif_tx_corked;
        boolean_t                       vif_tx_drain;
        timeout_id_t                    vif_tx_reclaim_tid;

        /*
         * Configured offload features:
         */
        unsigned int                    vif_tx_csum:1;
        unsigned int                    vif_tx_tso4:1;
        unsigned int                    vif_tx_tso6:1;

        /*
         * For debugging, it is useful to know whether the MAC address we
         * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
         * was otherwise generated or set from within the guest.
         */
        unsigned int                    vif_mac_from_host:1;

        unsigned int                    vif_has_ctrlq:1;
        unsigned int                    vif_has_ctrlq_rx:1;

        uint32_t                        vif_speed;      /* Mb/s */
        uint16_t                        vif_status;
        uint8_t                         vif_duplex;
        uint_t                          vif_mtu;
        uint_t                          vif_mtu_max;
        uint8_t                         vif_mac[ETHERADDRL];

        /*
         * Receive buffer free list and accounting:
         */
        list_t                          vif_rxbufs;
        uint_t                          vif_nrxbufs_alloc;
        uint_t                          vif_nrxbufs_onloan;
        uint_t                          vif_nrxbufs_onloan_max;
        uint_t                          vif_rxbufs_capacity;
        vioif_rxbuf_t                   *vif_rxbufs_mem;
        size_t                          vif_rxbuf_hdrlen;

        /*
         * Transmit buffer free list and accounting:
         */
        list_t                          vif_txbufs;
        uint_t                          vif_ntxbufs_alloc;
        uint_t                          vif_txbufs_capacity;
        vioif_txbuf_t                   *vif_txbufs_mem;

        /*
         * These copy size thresholds are exposed as private MAC properties so
         * that they can be tuned without rebooting.
         */
        uint_t                          vif_rxcopy_thresh;
        uint_t                          vif_txcopy_thresh;

        list_t                          vif_ctrlbufs;
        uint_t                          vif_nctrlbufs_alloc;
        uint_t                          vif_ctrlbufs_capacity;
        vioif_ctrlbuf_t                 *vif_ctrlbufs_mem;

        /*
         * Statistics visible through mac:
         */
        uint64_t                        vif_ipackets;
        uint64_t                        vif_opackets;
        uint64_t                        vif_rbytes;
        uint64_t                        vif_obytes;
        uint64_t                        vif_brdcstxmt;
        uint64_t                        vif_brdcstrcv;
        uint64_t                        vif_multixmt;
        uint64_t                        vif_multircv;
        uint64_t                        vif_norecvbuf;
        uint64_t                        vif_notxbuf;
        uint64_t                        vif_ierrors;
        uint64_t                        vif_oerrors;

        /*
         * Internal debugging statistics:
         */
        uint64_t                        vif_rxfail_dma_handle;
        uint64_t                        vif_rxfail_dma_buffer;
        uint64_t                        vif_rxfail_dma_bind;
        uint64_t                        vif_rxfail_chain_undersize;
        uint64_t                        vif_rxfail_no_descriptors;
        uint64_t                        vif_txfail_dma_handle;
        uint64_t                        vif_txfail_dma_bind;
        uint64_t                        vif_txfail_indirect_limit;

        uint64_t                        vif_stat_tx_reclaim;

        uint64_t                        vif_noctrlbuf;
        uint64_t                        vif_ctrlbuf_toosmall;
};

#ifdef __cplusplus
}
#endif

#endif /* _VIOIF_H */