root/sys/net/if_infiniband.c
/*-
 * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kbd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/devctl.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#ifdef KDB
#include <sys/kdb.h>
#endif

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/infiniband.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_lagg.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>

#include <security/mac/mac_framework.h>

/*
 * if_lagg(4) support.
 *
 * Input hook invoked by infiniband_input() for interfaces whose if_type is
 * IFT_INFINIBANDLAG.  It receives the port ifnet and the mbuf and returns
 * the mbuf to continue processing, or NULL if the packet was consumed.
 * Nothing in this file assigns the pointer; presumably if_lagg(4) sets it
 * when loaded -- confirm against the lagg driver.
 */
struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *);

#ifdef INET
/*
 * Map an IPv4 multicast address onto a 20-byte IPoIB link-layer
 * multicast address.
 *
 * addr      - IPv4 multicast address in network byte order.
 * broadcast - the interface's 20-byte IPoIB broadcast address; the scope
 *             nibble (byte 5) and the two bytes at offsets 8-9 are
 *             inherited from it.
 * buf       - output, at least 20 bytes.
 *
 * Per RFC 4391 only the low-order 28 bits of the IPv4 address are placed
 * in the group ID; the remaining group ID bits are zero.  The upper nibble
 * of the address (always 0xE for a multicast address) must therefore be
 * masked off, otherwise the resulting MGID differs from the one computed
 * by other IPoIB implementations and the hosts end up in different groups.
 */
static inline void
infiniband_ipv4_multicast_map(uint32_t addr,
    const uint8_t *broadcast, uint8_t *buf)
{
        uint8_t scope;

        addr = ntohl(addr);
        scope = broadcast[5] & 0xF;

        buf[0] = 0;
        buf[1] = 0xff;
        buf[2] = 0xff;
        buf[3] = 0xff;
        buf[4] = 0xff;
        buf[5] = 0x10 | scope;
        buf[6] = 0x40;                  /* IPv4 signature 0x401b */
        buf[7] = 0x1b;
        buf[8] = broadcast[8];
        buf[9] = broadcast[9];
        buf[10] = 0;
        buf[11] = 0;
        buf[12] = 0;
        buf[13] = 0;
        buf[14] = 0;
        buf[15] = 0;
        buf[16] = (addr >> 24) & 0x0f;  /* keep only the low 28 bits */
        buf[17] = (addr >> 16) & 0xff;
        buf[18] = (addr >> 8) & 0xff;
        buf[19] = addr & 0xff;
}
#endif

#ifdef INET6
/*
 * Map an IPv6 multicast address onto a 20-byte IPoIB link-layer
 * multicast address.
 *
 * The first eight output bytes are a fixed prefix carrying the scope
 * nibble taken from byte 5 of the broadcast address and the 0x601b
 * signature; bytes 8-9 are copied from the broadcast address; the last
 * ten bytes are the low-order ten bytes of the IPv6 address.
 */
static inline void
infiniband_ipv6_multicast_map(const struct in6_addr *addr,
    const uint8_t *broadcast, uint8_t *buf)
{
        const uint8_t prefix[8] = {
                0x00, 0xff, 0xff, 0xff, 0xff,
                (uint8_t)(0x10 | (broadcast[5] & 0xF)),
                0x60, 0x1b
        };

        memcpy(buf, prefix, sizeof(prefix));
        buf[8] = broadcast[8];
        buf[9] = broadcast[9];
        memcpy(buf + 10, addr->s6_addr + 6, 10);
}
#endif

/*
 * This is for clients that have an infiniband_header in the mbuf.
 */
void
infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb)
{
        struct infiniband_header *ibh;
        struct ether_header eh;

        if (!bpf_peers_present(ifp->if_bpf))
                return;

        M_ASSERTVALID(mb);
        if (mb->m_len < sizeof(*ibh))
                return;

        ibh = mtod(mb, struct infiniband_header *);
        eh.ether_type = ibh->ib_protocol;
        memset(eh.ether_shost, 0, ETHER_ADDR_LEN);
        memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN);
        mb->m_data += sizeof(*ibh);
        mb->m_len -= sizeof(*ibh);
        mb->m_pkthdr.len -= sizeof(*ibh);
        bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
        mb->m_data -= sizeof(*ibh);
        mb->m_len += sizeof(*ibh);
        mb->m_pkthdr.len += sizeof(*ibh);
}

/*
 * For clients using BPF to send broadcasts.
 *
 * This driver binds to BPF as an EN10MB (Ethernet) device type. As such, it is
 * expected BPF and BPF users will send frames with Ethernet headers, which
 * we'll do our best to handle. We can't resolve non-native unicast or multicast
 * link-layer addresses, but we can handle broadcast frames.
 *
 * On success the whole Infiniband header in ibh is populated (address,
 * protocol and the reserved field) and the mbuf is flagged M_BCAST.
 *
 * phlen is populated with IB header size if ibh was populated, 0 otherwise.
 *
 * Returns 0 on success or passthrough, EOPNOTSUPP for a BPF frame whose
 * Ethernet destination is not the broadcast address.
 */
static int
infiniband_resolve_bpf(struct ifnet *ifp, const struct sockaddr *dst,
    struct mbuf *mb, const struct route *ro, struct infiniband_header *ibh,
    int *phlen)
{
        struct ether_header *eh = (struct ether_header *)ro->ro_prepend;

        /*
         * If the prepend data & address length don't have the signature of
         * a frame forwarded by BPF, allow the frame to pass through.
         */
        if (((ro->ro_flags & RT_HAS_HEADER) == 0) ||
            (ro->ro_plen != ETHER_HDR_LEN)) {
                *phlen = 0;
                return (0);
        }

        /* Looks like this frame is from BPF. Handle broadcasts, reject otherwise */
        if (!ETHER_IS_BROADCAST(eh->ether_dhost))
                return (EOPNOTSUPP);

        memcpy(ibh->ib_hwaddr, ifp->if_broadcastaddr, sizeof(ibh->ib_hwaddr));
        ibh->ib_protocol = eh->ether_type;
        /*
         * infiniband_output() hands us an uninitialized on-stack buffer and
         * later copies the full header into the packet, so the reserved
         * field must be cleared here or stale stack bytes go on the wire.
         */
        ibh->ib_reserved = 0;
        mb->m_flags &= ~M_MCAST;
        mb->m_flags |= M_BCAST;

        *phlen = INFINIBAND_HDR_LEN;
        return (0);
}

/*
 * Translate the checksum-offload requests set on src into the matching
 * "already verified" flags on dst, for packets that are looped back
 * instead of being transmitted.  src and dst may be the same mbuf.
 */
static void
update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst)
{
        int verified;

        verified = 0;
        if ((src->m_pkthdr.csum_flags & CSUM_IP) != 0)
                verified |= CSUM_IP_CHECKED | CSUM_IP_VALID;
        if ((src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0)
                verified |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
        if ((src->m_pkthdr.csum_flags & CSUM_SCTP) != 0)
                verified |= CSUM_SCTP_VALID;

        dst->m_pkthdr.csum_flags |= verified;

        /* A valid data checksum is reported with csum_data of 0xffff. */
        if ((verified & CSUM_DATA_VALID) != 0)
                dst->m_pkthdr.csum_data = 0xffff;
}

/*
 * Handle link-layer encapsulation requests.
 *
 * Only IFENCAP_LL requests are served (EOPNOTSUPP otherwise) and the
 * caller's buffer must hold at least INFINIBAND_HDR_LEN bytes (ENOMEM
 * otherwise).  On success an Infiniband header for the requested family is
 * written to req->buf, req->bufsize is set to the header size and
 * req->lladdr_off to 0.  Families other than AF_INET, AF_INET6 and AF_ARP
 * yield EAFNOSUPPORT.
 */
static int
infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req)
{
        struct infiniband_header *hdr;
        struct arphdr *arp;
        const uint8_t *dst_lladdr;
        uint16_t proto;
        uint16_t arp_op;

        if (req->rtype != IFENCAP_LL)
                return (EOPNOTSUPP);
        if (req->bufsize < INFINIBAND_HDR_LEN)
                return (ENOMEM);

        hdr = (struct infiniband_header *)req->buf;
        dst_lladdr = req->lladdr;
        req->lladdr_off = 0;

        if (req->family == AF_INET) {
                proto = htons(ETHERTYPE_IP);
        } else if (req->family == AF_INET6) {
                proto = htons(ETHERTYPE_IPV6);
        } else if (req->family == AF_ARP) {
                /* Stamp the ARP payload with the Infiniband hardware type. */
                arp = (struct arphdr *)req->hdata;
                arp->ar_hrd = htons(ARPHRD_INFINIBAND);

                /* Reverse-ARP opcodes get their own ethertype. */
                arp_op = ntohs(arp->ar_op);
                if (arp_op == ARPOP_REVREQUEST || arp_op == ARPOP_REVREPLY)
                        proto = htons(ETHERTYPE_REVARP);
                else
                        proto = htons(ETHERTYPE_ARP);

                if ((req->flags & IFENCAP_FLAG_BROADCAST) != 0)
                        dst_lladdr = ifp->if_broadcastaddr;
        } else {
                return (EAFNOSUPPORT);
        }

        hdr->ib_protocol = proto;
        hdr->ib_reserved = 0;
        memcpy(hdr->ib_hwaddr, dst_lladdr, INFINIBAND_ADDR_LEN);
        req->bufsize = sizeof(*hdr);

        return (0);
}

/*
 * Resolve the link-layer header for an outgoing packet into phdr.
 *
 * Unicast destinations are handed to arpresolve() (AF_INET) or
 * nd6_resolve() (AF_INET6), which fill phdr and may return an llentry via
 * plle.  Broadcast and multicast destinations are mapped locally and the
 * Infiniband header is written directly.
 *
 * If plle is non-NULL, *plle is cleared on entry.
 *
 * Returns 0 on success with *pflags set to RT_MAY_LOOP, plus RT_L2_ME when
 * the resolver reported LLE_IFADDR.  For an unsupported address family the
 * mbuf is freed here and EAFNOSUPPORT is returned.  Any other error comes
 * from the resolver, except that EHOSTDOWN is turned into EHOSTUNREACH when
 * the route has RT_HAS_GW set; *pflags is left untouched on error.
 */
static int
infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m,
    const struct sockaddr *dst, struct route *ro, uint8_t *phdr,
    uint32_t *pflags, struct llentry **plle)
{
#if defined(INET) || defined(INET6)
        struct infiniband_header *ih = (struct infiniband_header *)phdr;
#endif
        uint32_t lleflags = 0;
        int error = 0;

        if (plle)
                *plle = NULL;

        switch (dst->sa_family) {
#ifdef INET
        case AF_INET:
                if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) {
                        /* Unicast: let ARP fill in the header. */
                        error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle);
                } else {
                        if (m->m_flags & M_BCAST) {
                                memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr,
                                    INFINIBAND_ADDR_LEN);
                        } else {
                                infiniband_ipv4_multicast_map(
                                    ((const struct sockaddr_in *)dst)->sin_addr.s_addr,
                                    ifp->if_broadcastaddr, ih->ib_hwaddr);
                        }
                        ih->ib_protocol = htons(ETHERTYPE_IP);
                        ih->ib_reserved = 0;
                }
                break;
#endif
#ifdef INET6
        case AF_INET6:
                if ((m->m_flags & M_MCAST) == 0) {
                        /* Unicast: let neighbor discovery fill in the header. */
                        int af = RO_GET_FAMILY(ro, dst);
                        error = nd6_resolve(ifp, LLE_SF(af, 0), m, dst, phdr,
                            &lleflags, plle);
                } else {
                        infiniband_ipv6_multicast_map(
                            &((const struct sockaddr_in6 *)dst)->sin6_addr,
                            ifp->if_broadcastaddr, ih->ib_hwaddr);
                        ih->ib_protocol = htons(ETHERTYPE_IPV6);
                        ih->ib_reserved = 0;
                }
                break;
#endif
        default:
                /* Unsupported family: the mbuf is consumed here. */
                if_printf(ifp, "can't handle af%d\n", dst->sa_family);
                if (m != NULL)
                        m_freem(m);
                return (EAFNOSUPPORT);
        }

        /* Report an unreachable host rather than a down one via a gateway. */
        if (error == EHOSTDOWN) {
                if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0)
                        error = EHOSTUNREACH;
        }

        if (error != 0)
                return (error);

        *pflags = RT_MAY_LOOP;
        if (lleflags & LLE_IFADDR)
                *pflags |= RT_L2_ME;

        return (0);
}

/*
 * Infiniband output routine.
 *
 * Picks a link-layer header for the packet from one of three sources, in
 * order: the route's ro_prepend data (rewritten into a broadcast Infiniband
 * header when it looks like an Ethernet header from BPF), a cached llentry
 * on the route, or a fresh resolution via infiniband_resolve_addr().  The
 * header is then prepended and the packet handed to if_transmit, unless the
 * destination is the interface itself (RT_L2_ME), in which case it is
 * looped back with if_simloop().
 *
 * Must be called inside the network epoch.  Returns 0 or an errno value.
 * On the "bad" path the mbuf is freed here.  When infiniband_resolve_addr()
 * fails the mbuf is not freed by this function, and EWOULDBLOCK from the
 * resolver is reported to the caller as success.
 */
static int
infiniband_output(struct ifnet *ifp, struct mbuf *m,
    const struct sockaddr *dst, struct route *ro)
{
        uint8_t linkhdr[INFINIBAND_HDR_LEN];
        uint8_t *phdr;
        struct llentry *lle = NULL;
        struct infiniband_header *ih;
        int error = 0;
        int hlen  = 0;  /* link layer header length */
        uint32_t pflags;
        bool addref;

        NET_EPOCH_ASSERT();

        addref = false;
        phdr = NULL;
        pflags = 0;
        if (ro != NULL) {
                /* XXX BPF and ARP use ro_prepend */
                if (ro->ro_prepend != NULL) {
                        ih = (struct infiniband_header *)linkhdr;
                        /* Assess whether frame is from BPF and handle */
                        error = infiniband_resolve_bpf(ifp, dst, m, ro, ih, &hlen);
                        if (error != 0)
                                goto bad;

                        if (hlen != 0) {
                                /* BPF frame: use the header built in linkhdr. */
                                phdr = linkhdr;
                        } else {
                                /* Not from BPF: use the prepend data as is. */
                                phdr = ro->ro_prepend;
                                hlen = ro->ro_plen;
                        }
                } else if (!(m->m_flags & (M_BCAST | M_MCAST))) {
                        if ((ro->ro_flags & RT_LLE_CACHE) != 0) {
                                lle = ro->ro_lle;
                                /* Drop a cached entry that is no longer valid. */
                                if (lle != NULL &&
                                    (lle->la_flags & LLE_VALID) == 0) {
                                        LLE_FREE(lle);
                                        lle = NULL;     /* redundant */
                                        ro->ro_lle = NULL;
                                }
                                if (lle == NULL) {
                                        /* if we lookup, keep cache */
                                        addref = 1;
                                } else
                                        /*
                                         * Notify LLE code that
                                         * the entry was used
                                         * by datapath.
                                         */
                                        llentry_provide_feedback(lle);
                        }
                        if (lle != NULL) {
                                /* Use the header precomputed in the entry. */
                                phdr = lle->r_linkdata;
                                hlen = lle->r_hdrlen;
                                pflags = lle->r_flags;
                        }
                }
        }

#ifdef MAC
        error = mac_ifnet_check_transmit(ifp, m);
        if (error)
                goto bad;
#endif

        M_PROFILE(m);
        /* Interfaces in monitor mode do not transmit. */
        if (ifp->if_flags & IFF_MONITOR) {
                error = ENETDOWN;
                goto bad;
        }
        /* The interface must be both up and running. */
        if (!((ifp->if_flags & IFF_UP) &&
            (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
                error = ENETDOWN;
                goto bad;
        }

        if (phdr == NULL) {
                /* No prepend data supplied. Try to calculate ourselves. */
                phdr = linkhdr;
                hlen = INFINIBAND_HDR_LEN;
                error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags,
                    addref ? &lle : NULL);
                /* Cache the entry returned by the lookup on the route. */
                if (addref && lle != NULL)
                        ro->ro_lle = lle;
                /* The mbuf is not freed here; EWOULDBLOCK is not an error. */
                if (error != 0)
                        return (error == EWOULDBLOCK ? 0 : error);
        }

        /* Destination is this interface: loop the packet back. */
        if ((pflags & RT_L2_ME) != 0) {
                update_mbuf_csumflags(m, m);
                return (if_simloop(ifp, m, RO_GET_FAMILY(ro, dst), 0));
        }

        /*
         * Add local infiniband header. If no space in first mbuf,
         * allocate another.
         */
        M_PREPEND(m, hlen, M_NOWAIT);
        if (m == NULL) {
                error = ENOBUFS;
                goto bad;
        }
        /* Copy the header in unless the flags say it is already present. */
        if ((pflags & RT_HAS_HEADER) == 0) {
                ih = mtod(m, struct infiniband_header *);
                memcpy(ih, phdr, hlen);
        }

        /*
         * Queue message on interface, update output statistics if
         * successful, and start output if interface not yet active.
         */
        return (ifp->if_transmit(ifp, m));
bad:
        if (m != NULL)
                m_freem(m);
        return (error);
}

/*
 * Process a received Infiniband packet.
 *
 * The mbuf is expected to start with an infiniband_header.  The packet is
 * classified as broadcast/multicast, tapped to BPF, optionally passed
 * through the lagg input hook, and then dispatched to the netisr protocol
 * selected by ib_protocol with the Infiniband header stripped.  The mbuf
 * is always consumed: it is either dispatched, handed to the lagg hook, or
 * freed.
 *
 * The function sets the vnet context from the interface and enters the
 * network epoch itself when the interface has IFF_NEEDSEPOCH set.
 */
static void
infiniband_input(struct ifnet *ifp, struct mbuf *m)
{
        struct infiniband_header *ibh;
        struct epoch_tracker et;
        int isr;
        bool needs_epoch;

        needs_epoch = (ifp->if_flags & IFF_NEEDSEPOCH);
#ifdef INVARIANTS
        /*
         * This temporary code is here to prevent epoch unaware and unmarked
         * drivers to panic the system.  Once all drivers are taken care of,
         * the whole INVARIANTS block should go away.
         */
        if (!needs_epoch && !in_epoch(net_epoch_preempt)) {
                static bool printedonce;

                needs_epoch = true;
                if (!printedonce) {
                        printedonce = true;
                        if_printf(ifp, "called %s w/o net epoch! "
                            "PLEASE file a bug report.", __func__);
#ifdef KDB
                        kdb_backtrace();
#endif
                }
        }
#endif

        CURVNET_SET_QUIET(ifp->if_vnet);
        if (__predict_false(needs_epoch))
                NET_EPOCH_ENTER(et);

        /* Drop everything received on an interface that is not up. */
        if ((ifp->if_flags & IFF_UP) == 0) {
                if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
                m_freem(m);
                goto done;
        }

        /*
         * NOTE(review): m_len is not checked against INFINIBAND_HDR_LEN
         * before the header is accessed; this presumably relies on the
         * driver always delivering a contiguous header -- confirm.
         */
        ibh = mtod(m, struct infiniband_header *);

        /*
         * Reset layer specific mbuf flags to avoid confusing upper
         * layers:
         */
        m->m_flags &= ~M_VLANTAG;
        m_clrprotoflags(m);

        /* Classify broadcast vs. multicast and count both as multicast. */
        if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) {
                if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr,
                    ifp->if_addrlen) == 0)
                        m->m_flags |= M_BCAST;
                else
                        m->m_flags |= M_MCAST;
                if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
        }

        /* Let BPF have it before we strip the header. */
        infiniband_bpf_mtap(ifp, m);

        /* Allow monitor mode to claim this frame, after stats are updated. */
        if (ifp->if_flags & IFF_MONITOR) {
                m_freem(m);
                goto done;
        }

        /* Direct packet to correct FIB based on interface config. */
        M_SETFIB(m, ifp->if_fib);

        /* Handle input from a lagg<N> port */
        if (ifp->if_type == IFT_INFINIBANDLAG) {
                KASSERT(lagg_input_infiniband_p != NULL,
                    ("%s: if_lagg not loaded!", __func__));
                m = (*lagg_input_infiniband_p)(ifp, m);
                /* A NULL return means the hook consumed the packet. */
                if (__predict_false(m == NULL))
                        goto done;
                /* Continue on behalf of the interface recorded in the mbuf. */
                ifp = m->m_pkthdr.rcvif;
        }

        /*
         * Dispatch frame to upper layer.
         */
        switch (ibh->ib_protocol) {
#ifdef INET
        case htons(ETHERTYPE_IP):
                isr = NETISR_IP;
                break;

        case htons(ETHERTYPE_ARP):
                if (ifp->if_flags & IFF_NOARP) {
                        /* Discard packet if ARP is disabled on interface */
                        m_freem(m);
                        goto done;
                }
                isr = NETISR_ARP;
                break;
#endif
#ifdef INET6
        case htons(ETHERTYPE_IPV6):
                isr = NETISR_IPV6;
                break;
#endif
        default:
                /* Unknown protocol: count as an input error and drop. */
                if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
                m_freem(m);
                goto done;
        }

        /* Strip off the Infiniband header. */
        m_adj(m, INFINIBAND_HDR_LEN);

#ifdef MAC
        /*
         * Tag the mbuf with an appropriate MAC label before any other
         * consumers can get to it.
         */
        mac_ifnet_create_mbuf(ifp, m);
#endif
        /* Hand the stripped packet to the selected netisr protocol. */
        netisr_dispatch(isr, m);
done:
        /* Leave the epoch (if entered here) and restore the vnet context. */
        if (__predict_false(needs_epoch))
                NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();
}

/*
 * Translate a multicast address of any supported family into the
 * link-layer multicast address to program on the interface.
 *
 * AF_LINK addresses are only validated and *llsa is set to NULL, since no
 * mapping is needed.  AF_INET and AF_INET6 multicast addresses are mapped
 * into a sockaddr_dl initialised from the caller-supplied *llsa, which is
 * then stored back through llsa.
 *
 * Returns 0 on success, EADDRNOTAVAIL when the address is not a usable
 * multicast address and EAFNOSUPPORT for any other family.
 */
static int
infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
    struct sockaddr *sa)
{
        struct sockaddr_dl *ll;
        uint8_t *mc_addr;

        switch (sa->sa_family) {
        case AF_LINK:
                /* Already a link-layer address; just verify it is multicast. */
                ll = (struct sockaddr_dl *)sa;
                mc_addr = LLADDR(ll);
                if (!INFINIBAND_IS_MULTICAST(mc_addr))
                        return (EADDRNOTAVAIL);
                *llsa = NULL;
                return (0);

#ifdef INET
        case AF_INET: {
                struct sockaddr_in *in4 = (struct sockaddr_in *)sa;

                if (!IN_MULTICAST(ntohl(in4->sin_addr.s_addr)))
                        return (EADDRNOTAVAIL);

                ll = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
                ll->sdl_alen = INFINIBAND_ADDR_LEN;
                mc_addr = LLADDR(ll);
                infiniband_ipv4_multicast_map(in4->sin_addr.s_addr,
                    ifp->if_broadcastaddr, mc_addr);
                *llsa = (struct sockaddr *)ll;
                return (0);
        }
#endif
#ifdef INET6
        case AF_INET6: {
                struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sa;

                /*
                 * The unspecified address would mean "all IPv6 multicast
                 * groups", which has no meaning on Infiniband; anything
                 * that is not a multicast address is rejected as well.
                 */
                if (IN6_IS_ADDR_UNSPECIFIED(&in6->sin6_addr) ||
                    !IN6_IS_ADDR_MULTICAST(&in6->sin6_addr))
                        return (EADDRNOTAVAIL);

                ll = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
                ll->sdl_alen = INFINIBAND_ADDR_LEN;
                mc_addr = LLADDR(ll);
                infiniband_ipv6_multicast_map(&in6->sin6_addr,
                    ifp->if_broadcastaddr, mc_addr);
                *llsa = (struct sockaddr *)ll;
                return (0);
        }
#endif
        default:
                return (EAFNOSUPPORT);
        }
}

/*
 * Perform common duties while attaching an Infiniband interface.
 *
 * ifp - interface to attach.
 * lla - link-layer address to install, or NULL to keep whatever the
 *       interface's link-level sockaddr already contains.
 * llb - broadcast address to install, or NULL to leave if_broadcastaddr
 *       unchanged.  The pointer itself is stored, not a copy.
 *
 * Sets the Infiniband address/header lengths and MTU, attaches the ifnet,
 * installs this file's output/input/resolvemulti/requestencap handlers,
 * attaches BPF as an Ethernet-compatible device and announces the new
 * interface through the infiniband_ifattach_event handler and devctl.
 */
void
infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb)
{
        struct sockaddr_dl *sdl;
        struct ifaddr *ifa;
        int i;

        ifp->if_addrlen = INFINIBAND_ADDR_LEN;
        ifp->if_hdrlen = INFINIBAND_HDR_LEN;
        ifp->if_mtu = INFINIBAND_MTU;
        if_attach(ifp);
        ifp->if_output = infiniband_output;
        ifp->if_input = infiniband_input;
        ifp->if_resolvemulti = infiniband_resolvemulti;
        ifp->if_requestencap = infiniband_requestencap;

        if (ifp->if_baudrate == 0)
                ifp->if_baudrate = IF_Gbps(10); /* default value */
        if (llb != NULL)
                ifp->if_broadcastaddr = llb;

        /* Mark the interface's link-level address as Infiniband. */
        ifa = ifp->if_addr;
        KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
        sdl = (struct sockaddr_dl *)ifa->ifa_addr;
        sdl->sdl_type = IFT_INFINIBAND;
        sdl->sdl_alen = ifp->if_addrlen;

        if (lla != NULL) {
                memcpy(LLADDR(sdl), lla, ifp->if_addrlen);

                if (ifp->if_hw_addr != NULL)
                        memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen);
        } else {
                /* No address supplied: announce the one already in place. */
                lla = LLADDR(sdl);
        }

        /* Attach ethernet compatible network device */
        bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);

        /* Announce Infiniband MAC address if non-zero. */
        for (i = 0; i < ifp->if_addrlen; i++)
                if (lla[i] != 0)
                        break;
        if (i != ifp->if_addrlen)
                if_printf(ifp, "Infiniband address: %20D\n", lla, ":");

        /* All necessary bits are set up; announce it now. */
        EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp);

        /* devctl notifications are only sent from the default vnet. */
        if (IS_DEFAULT_VNET(curvnet))
                devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL);
}

/*
 * Perform common duties while detaching an Infiniband interface:
 * detach the interface from BPF with bpfdetach(), then call if_detach()
 * on the ifnet.
 */
void
infiniband_ifdetach(struct ifnet *ifp)
{
        bpfdetach(ifp);
        if_detach(ifp);
}

/*
 * Module event handler.  Loading and unloading need no work and always
 * succeed; every other event type is rejected with EOPNOTSUPP.
 */
static int
infiniband_modevent(module_t mod, int type, void *data)
{
        if (type == MOD_LOAD || type == MOD_UNLOAD)
                return (0);

        return (EOPNOTSUPP);
}

/* Module description: name and event handler for "if_infiniband". */
static moduledata_t infiniband_mod = {
        .name = "if_infiniband",
        .evhand = &infiniband_modevent,
};

/* Register the module at SI_SUB_INIT_IF and export version 1. */
DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(if_infiniband, 1);