root/usr/src/uts/common/io/trill.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2024 Oxide Computer Company
 */

/*
 *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
 */

#include <sys/strsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/tihdr.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <net/trill.h>
#include <net/if_dl.h>
#include <sys/mac.h>
#include <sys/mac_client.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
#include <sys/sdt.h>
#include <sys/dls.h>
#include <sys/sunddi.h>

#include "trill_impl.h"

/* Forward declarations of file-local functions. */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * Global list of TRILL instances (trill_inst_t, linked through ti_instnode).
 * trill_inst_rwlock is held around accesses to the list.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);

/*
 * Socket module registration data, referenced by sockmod below.  It names
 * the module "trill" and lists trill_create as its creation routine.
 */
static smod_reg_t sinfo = {
        SOCKMOD_VERSION,
        "trill",
        SOCK_UC_VERSION,
        SOCK_DC_VERSION,
        trill_create,
        NULL,
};

/* modlsockmod structure: socket-module linkage wrapping sinfo */
static struct modlsockmod sockmod = {
        &mod_sockmodops, "AF_TRILL socket module", &sinfo
};

/* modlinkage structure passed to mod_install/mod_info/mod_remove */
static struct modlinkage ml = {
        MODREV_1,
        &sockmod,
        NULL
};

/*
 * True when n is a usable nickname, i.e. neither RBRIDGE_NICKNAME_NONE nor
 * RBRIDGE_NICKNAME_UNUSED.  Note that n is evaluated twice.
 */
#define VALID_NICK(n)   ((n) != RBRIDGE_NICKNAME_NONE && \
                        (n) != RBRIDGE_NICKNAME_UNUSED)

/*
 * Prepend a new outer Ethernet header, addressed to daddr, to mp for
 * transmission on the link bound to tsock.
 *
 *  - A VLAN tag carrying tci is written unless tci is TRILL_NO_TCI or equals
 *    the link's PVID.
 *  - When trill_hdr_ok is false, a zeroed TRILL header is appended with the
 *    default hop count and the multi-destination bit taken from multidest.
 *  - A non-zero msglen is used as the Ethernet type in place of
 *    ETHERTYPE_TRILL.
 *
 * Returns the header mblk chained in front of mp.  If the MAC header cannot
 * be built, mp is freed and NULL is returned.
 */
static mblk_t *
create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
    boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
    size_t msglen)
{
        mblk_t *outer;
        uint16_t sap = ETHERTYPE_TRILL;
        boolean_t tagged;
        int reserve = 0;

        if (msglen > 0)
                sap = (uint16_t)msglen;

        /* Frames sent on the PVID must go out without a VLAN ID */
        if (tci == tsock->ts_link->bl_pvid)
                tci = TRILL_NO_TCI;
        tagged = (tci != TRILL_NO_TCI) ? B_TRUE : B_FALSE;

        /*
         * Work out how much room beyond the plain Ethernet header is needed
         * for the TRILL header and/or the VLAN tag, then build the header.
         */
        if (!trill_hdr_ok)
                reserve += sizeof (trill_header_t);
        if (tagged)
                reserve += sizeof (struct ether_vlan_extinfo);

        outer = mac_header(tsock->ts_link->bl_mh, daddr,
            tagged ? ETHERTYPE_VLAN : sap, mp, reserve);
        if (outer == NULL) {
                freemsg(mp);
                return (NULL);
        }

        if (tagged) {
                struct ether_vlan_header *evh;

                /* LINTED: alignment */
                evh = (struct ether_vlan_header *)outer->b_rptr;
                evh->ether_tci = htons(tci);
                evh->ether_type = htons(sap);
                outer->b_wptr += sizeof (struct ether_vlan_extinfo);
        }

        if (!trill_hdr_ok) {
                trill_header_t *thp;

                /* LINTED: alignment */
                thp = (trill_header_t *)outer->b_wptr;
                (void) memset(thp, 0, sizeof (*thp));
                thp->th_hopcount = TRILL_DEFAULT_HOPS;
                thp->th_multidest = (multidest ? 1 : 0);
                outer->b_wptr += sizeof (*thp);
        }

        outer->b_cont = mp;
        return (outer);
}

/*
 * Local receive path for TRILL data frames.  The caller has already moved
 * b_rptr past the TRILL header, so mp starts at the inner Ethernet header.
 * The frame is handed to the bridge module, which takes care of learning and
 * local delivery.  Used for frames addressed to us and for multi-destination
 * frames when we are the forwarder on the link.
 */
static void
trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
{
        struct ether_header *inner;

        /* LINTED: alignment */
        inner = (struct ether_header *)mp->b_rptr;
        DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner);

        /* Reset the checksum flags before passing the frame on */
        DB_CKSUMFLAGS(mp) = 0;

        /* Bridging does source address learning and the actual forwarding */
        bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
        KSPINCR(tks_decap);
}

/*
 * Find the TRILL socket in this instance that is bound to the datalink
 * identified by linkid.  On a match the socket's reference count is bumped
 * and the socket is returned; NULL is returned when no bound socket matches.
 * The caller must hold the instance rwlock.
 */
static trill_sock_t *
find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
{
        trill_sock_t *cur;

        ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));

        cur = list_head(&tip->ti_socklist);
        while (cur != NULL) {
                if (cur->ts_link != NULL && cur->ts_link->bl_linkid == linkid) {
                        ASSERT(cur->ts_link->bl_mh != NULL);
                        ASSERT(!(cur->ts_flags & TSF_SHUTDOWN));
                        /* Hand the caller its own hold on the socket */
                        atomic_inc_uint(&cur->ts_refs);
                        return (cur);
                }
                cur = list_next(&tip->ti_socklist, cur);
        }
        return (NULL);
}

/*
 * TRILL destination forwarding function. Transmits the TRILL data packet
 * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
 *
 *   tip            TRILL instance used for the node lookup and our nickname.
 *   fwd_mp         frame to send; always consumed (sent or freed).
 *   adj_nick       nickname of the adjacent RBridge to transmit to.
 *   has_trill_hdr  B_TRUE if fwd_mp already starts with a TRILL header; when
 *                  B_FALSE a new header is created and its ingress/egress
 *                  nicknames are filled in here.
 *   multidest      B_TRUE for multi-destination frames; for new frames this
 *                  selects dtnick rather than adj_nick as the egress nick.
 *   dtnick         distribution tree root nickname (used when multidest).
 *
 * The frame is dropped if the adjacency is unknown or has no socket, if the
 * outer header cannot be built or pulled up, or if our own nickname is not
 * valid.  The hop count is decremented before transmission.
 */
static void
trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
    boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
{
        trill_node_t *adj;
        trill_sock_t *tsock = NULL;
        trill_header_t *trillhdr;
        struct ether_header *ethhdr;
        int ethtype;
        int ethhdrlen;

        adj = trill_node_lookup(tip, adj_nick);
        if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
                goto dest_fwd_fail;

        ASSERT(tsock->ts_link != NULL);
        ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
        ASSERT(adj->tn_ni != NULL);

        /* adj and tsock are pointers; declare them as such for consumers */
        DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t *,
            adj, trill_sock_t *, tsock);

        /*
         * For broadcast links by using the dest address of
         * the RBridge to forward the frame should result in
         * savings. When the link is a bridged LAN or there are
         * many end stations the frame will not always be flooded.
         */
        fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
            has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
        if (fwd_mp == NULL)
                goto dest_fwd_fail;

        /* LINTED: alignment */
        ethhdr = (struct ether_header *)fwd_mp->b_rptr;
        ethtype = ntohs(ethhdr->ether_type);
        ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);

        /* Pullup Ethernet and TRILL header (w/o TRILL options) */
        ethhdrlen = sizeof (struct ether_header) +
            (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
        if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
                goto dest_fwd_fail;
        /* LINTED: alignment */
        trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);

        /* Update TRILL header with ingress and egress nicks for new frames */
        if (!has_trill_hdr) {
                /* We are creating a new TRILL frame */
                trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
                rw_enter(&tip->ti_rwlock, RW_READER);
                trillhdr->th_ingressnick = tip->ti_nick;
                rw_exit(&tip->ti_rwlock);
                if (!VALID_NICK(trillhdr->th_ingressnick))
                        goto dest_fwd_fail;
        }

        /* Set hop count and update header in packet */
        ASSERT(trillhdr->th_hopcount != 0);
        trillhdr->th_hopcount--;

        /* Clear checksum flag and transmit frame on the link */
        DB_CKSUMFLAGS(fwd_mp) = 0;
        /*
         * Pass the header pointer itself; the address of the local variable
         * would hand the probe a stack address instead of the TRILL header.
         */
        DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, trillhdr);
        /* A non-NULL return means the frame was not sent and is still ours */
        fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
        if (fwd_mp == NULL) {
                KSPINCR(tks_sent);
                KSPINCR(tks_forward);
        } else {
                freemsg(fwd_mp);
                KSPINCR(tks_drops);
        }
        trill_node_unref(tip, adj);
        return;

dest_fwd_fail:
        if (adj != NULL)
                trill_node_unref(tip, adj);
        /* Drops can only be counted once we know which socket to charge */
        if (tsock != NULL)
                KSPINCR(tks_drops);
        freemsg(fwd_mp);
}

/*
 * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
 * on the distribution tree determined by the egress nick. Source addr (saddr)
 * is NULL for new TRILL packets originating from us.
 *
 *   tip           TRILL instance used for node lookups.
 *   mp            frame to forward.
 *   egressnick    nickname of the distribution tree (DT) root.
 *   ingressnick   ingress RBridge nickname; an adjacency carrying this
 *                 nickname is skipped.
 *   is_trill_pkt  passed through to trill_dest_fwd() as has_trill_hdr.
 *   saddr         if non-NULL, an adjacency whose SNPA equals this address
 *                 is skipped so the frame is not sent back where it came
 *                 from.
 *   inner_vlan    VLAN ID used for pruning against the DT root's per-adjacency
 *                 filter map; VLAN_ID_NONE disables the check.
 *   free_mblk     B_TRUE if this function must consume mp.  In that case the
 *                 first eligible adjacency is remembered and later sent the
 *                 original mp, and every other adjacency gets a copy.  When
 *                 B_FALSE, every adjacency gets a copy and mp is left to the
 *                 caller.
 */
static void
trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
    uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
    int inner_vlan, boolean_t free_mblk)
{
        int idx;
        uint16_t adjnick;
        trill_node_t *dest;
        trill_node_t *adj;
        mblk_t *fwd_mp;
        boolean_t nicksaved = B_FALSE;
        uint16_t adjnicksaved;

        /* Lookup the egress nick info, this is the DT root */
        if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
                goto fail_multidest_fwd;

        /* Send a copy to all our adjacencies on the DT root  */
        ASSERT(dest->tn_ni);
        for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {

                /* Check for a valid adjacency node */
                adjnick = TNI_ADJNICK(dest->tn_ni, idx);
                if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
                    ((adj = trill_node_lookup(tip, adjnick)) == NULL))
                        continue;

                /* Do not forward back to adjacency that sent the pkt to us */
                ASSERT(adj->tn_ni != NULL);
                if ((saddr != NULL) &&
                    (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
                    ETHERADDRL) == 0)) {
                        trill_node_unref(tip, adj);
                        continue;
                }

                /* Check if adj is marked as reaching inner VLAN downstream */
                if ((inner_vlan != VLAN_ID_NONE) &&
                    !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
                    inner_vlan)) {
                        trill_node_unref(tip, adj);
                        DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
                            uint16_t, adjnick, uint16_t, ingressnick,
                            uint16_t, egressnick, int, inner_vlan);
                        continue;
                }

                /*
                 * Only the nickname is needed from here on;
                 * trill_dest_fwd() does its own lookup of the adjacency.
                 */
                trill_node_unref(tip, adj);

                /*
                 * Save the nick and look ahead to see if we should forward the
                 * frame to more adjacencies. We avoid doing a copy for this
                 * nick and use the passed mblk when we can consume the passed
                 * mblk.
                 */
                if (free_mblk && !nicksaved) {
                        adjnicksaved = adjnick;
                        nicksaved = B_TRUE;
                        continue;
                }

                /* Stop making copies on allocation failure */
                fwd_mp = copymsg(mp);
                if (fwd_mp == NULL)
                        break;
                DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
                    adjnick, uint16_t, ingressnick);
                trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
                    B_TRUE, egressnick);
        }
        trill_node_unref(tip, dest);

        /* The saved adjacency gets the original mblk, which is consumed */
        if (nicksaved) {
                ASSERT(free_mblk);
                DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
                    adjnicksaved, uint16_t, ingressnick);
                trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
                    B_TRUE, egressnick);
                return;
        }

        /*
         * Reached when the DT root lookup fails, and also by falling through
         * whenever no adjacency was saved (including every call made with
         * free_mblk set to B_FALSE), so the probe below fires in those cases
         * as well.
         */
fail_multidest_fwd:
        DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
            egressnick, uint16_t, ingressnick);
        if (free_mblk) {
                freemsg(mp);
        }
}

/*
 * TRILL data receive function. Forwards the received frame if necessary
 * and also determines if the received frame should be consumed locally.
 * Consumes passed mblk.
 *
 *   tsock    socket bound to the link the frame arrived on.
 *   mp       received frame; b_rptr is expected to point at the TRILL
 *            header.
 *   mpsaddr  outer Ethernet source address of the frame (ETHERADDRL bytes).
 *
 * Frames are dropped (and counted in tks_drops) when the header cannot be
 * pulled up, the version or nicknames are invalid, the ingress nickname is
 * our own, the hop count is exhausted, or a multi-destination frame fails
 * the adjacency or reverse-path checks.
 */
static void
trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
{
        trill_header_t *trillhdr;
        trill_node_t *dest = NULL;
        trill_node_t *source = NULL;
        trill_node_t *adj;
        uint16_t ournick, adjnick, treeroot;
        struct ether_header *ethhdr;
        trill_inst_t *tip = tsock->ts_tip;
        uint8_t srcaddr[ETHERADDRL];
        size_t trillhdrlen;
        int inner_vlan = VLAN_ID_NONE;
        int tci;
        int idx;
        size_t min_size;

        /* Copy Ethernet source address before modifying packet */
        (void) memcpy(srcaddr, mpsaddr, ETHERADDRL);

        /* Pull up TRILL header if necessary. */
        min_size = sizeof (trill_header_t);
        if ((MBLKL(mp) < min_size ||
            !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
            !pullupmsg(mp, min_size))
                goto fail;

        /* LINTED: alignment */
        trillhdr = (trill_header_t *)mp->b_rptr;
        if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
                DTRACE_PROBE1(trill__recv__wrongversion,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Drop if unknown or invalid nickname */
        if (!VALID_NICK(trillhdr->th_egressnick) ||
            !VALID_NICK(trillhdr->th_ingressnick)) {
                DTRACE_PROBE1(trill__recv__invalidnick,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Snapshot our nickname and the tree root under the instance lock */
        rw_enter(&tip->ti_rwlock, RW_READER);
        ournick = tip->ti_nick;
        treeroot = tip->ti_treeroot;
        rw_exit(&tip->ti_rwlock);
        /* Drop if we received a packet with our nick as ingress */
        if (trillhdr->th_ingressnick == ournick)
                goto fail;

        /* Re-pull any TRILL options and inner Ethernet header */
        min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
            sizeof (struct ether_header);
        if (MBLKL(mp) < min_size) {
                if (!pullupmsg(mp, min_size))
                        goto fail;
                /* pullupmsg may have moved the data; refresh the pointer */
                /* LINTED: alignment */
                trillhdr = (trill_header_t *)mp->b_rptr;
        }
        trillhdrlen = sizeof (trill_header_t) +
            (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));

        /*
         * Get the inner Ethernet header, plus the inner VLAN header if there
         * is one.
         */
        /* LINTED: alignment */
        ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
        if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
                min_size += sizeof (struct ether_vlan_extinfo);
                if (MBLKL(mp) < min_size) {
                        if (!pullupmsg(mp, min_size))
                                goto fail;
                        /* Refresh both header pointers after the pullup */
                        /* LINTED: alignment */
                        trillhdr = (trill_header_t *)mp->b_rptr;
                        /* LINTED: alignment */
                        ethhdr = (struct ether_header *)(mp->b_rptr +
                            trillhdrlen);
                }

                tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
                inner_vlan = VLAN_ID(tci);
        }

        /* Known/single destination forwarding. */
        if (!trillhdr->th_multidest) {

                /* Inner MacDA must be unicast */
                if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
                        goto fail;

                /* Ingress and Egress nicks must be different */
                if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
                        goto fail;

                DTRACE_PROBE1(trill__recv__singledest,
                    trill_header_t *, trillhdr);
                if (trillhdr->th_egressnick == ournick) {
                        /* For us: strip the TRILL header and deliver */
                        mp->b_rptr += trillhdrlen;
                        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
                } else if (trillhdr->th_hopcount > 0) {
                        /* Transit frame: pass it on towards the egress nick */
                        trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
                            B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
                } else {
                        goto fail;
                }
                return;
        }

        /*
         * Multi-destination frame: perform checks verifying we have
         * received a valid multi-destination frame before receiving the
         * frame locally and forwarding the frame to other RBridges.
         *
         * Check if we received this multi-destination frame on a
         * adjacency in the distribution tree indicated by the frame's
         * egress nickname.
         */
        if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
                adjnick = TNI_ADJNICK(dest->tn_ni, idx);
                if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
                        continue;
                if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
                        trill_node_unref(tip, adj);
                        break;
                }
                trill_node_unref(tip, adj);
        }

        /* Loop ran to completion: the sender is not an adjacency on the tree */
        if (idx >= dest->tn_ni->tni_adjcount) {
                DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
                    trill_header_t *, trillhdr, trill_node_t *, dest);
                goto fail;
        }

        /*
         * Reverse path forwarding check. Check if the ingress RBridge
         * that has forwarded the frame advertised the use of the
         * distribution tree specified in the egress nick.
         */
        if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
                if (TNI_DTROOTNICK(source->tn_ni, idx) ==
                    trillhdr->th_egressnick)
                        break;
        }

        if (idx >= source->tn_ni->tni_dtrootcount) {
                /*
                 * Allow receipt of forwarded frame with the highest
                 * tree root RBridge as the egress RBridge when the
                 * ingress RBridge has not advertised the use of any
                 * distribution trees.
                 */
                if (source->tn_ni->tni_dtrootcount != 0 ||
                    trillhdr->th_egressnick != treeroot) {
                        DTRACE_PROBE3(
                            trill__recv__multidest__rpfcheckfail,
                            trill_header_t *, trillhdr, trill_node_t *,
                            source, trill_inst_t *, tip);
                        goto fail;
                }
        }

        /* Check hop count before doing any forwarding */
        if (trillhdr->th_hopcount == 0)
                goto fail;

        /* Forward frame using the distribution tree specified by egress nick */
        DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
            trillhdr, trill_node_t *, source);
        trill_node_unref(tip, source);
        trill_node_unref(tip, dest);

        /* Tell forwarding not to free if we're the link forwarder. */
        trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
            trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
            B_FALSE);

        /*
         * Send de-capsulated frame locally if we are the link forwarder (also
         * does bridge learning).
         */
        mp->b_rptr += trillhdrlen;
        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
        KSPINCR(tks_recv);
        return;

        /* Common drop path: release any node holds taken, free and count */
fail:
        DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
            trill_sock_t *, tsock);
        if (dest != NULL)
                trill_node_unref(tip, dest);
        if (source != NULL)
                trill_node_unref(tip, source);
        freemsg(mp);
        KSPINCR(tks_drops);
}

/*
 * Unbind tsock from its bridge link and stop TRILL receive processing on it.
 *
 * Returns right away if the socket is already unbound.  If another thread is
 * part-way through the same teardown (TSF_CLOSEWAIT set), waits for it to
 * finish and then re-checks the state.  Otherwise this thread takes over the
 * teardown: it waits for in-progress socket transmit operations to drain,
 * deletes the per-socket kstat, drops the link reference held with the bridge
 * module, marks the socket TS_UNBND and wakes any waiters.
 */
static void
trill_stop_recv(trill_sock_t *tsock)
{
        mutex_enter(&tsock->ts_socklock);
stop_retry:
        if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
                mutex_exit(&tsock->ts_socklock);
                return;
        }

        /*
         * If another thread is closing the socket then wait. Our callers
         * expect us to return only after the socket is closed.
         */
        if (tsock->ts_flags & TSF_CLOSEWAIT) {
                cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
                goto stop_retry;
        }

        /*
         * Set state and flags to block new bind or close calls
         * while we close the socket.
         */
        tsock->ts_flags |= TSF_CLOSEWAIT;

        /* Wait until all AF_TRILL socket transmit operations are done */
        while (tsock->ts_sockthreadcount > 0)
                cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);

        /*
         * We are guaranteed to be the only thread closing on the
         * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
         * for us to finish.
         */
        ASSERT(tsock->ts_link != NULL);
        if (tsock->ts_ksp != NULL) {
                kstat_delete(tsock->ts_ksp);
                /*
                 * Forget the deleted kstat.  A later bind whose kstat setup
                 * bails out early would otherwise leave this stale pointer
                 * in place and the next unbind would delete it again.
                 */
                tsock->ts_ksp = NULL;
        }

        /*
         * Release lock before bridge_trill_lnunref to prevent deadlock
         * between trill_ctrl_input thread waiting to acquire ts_socklock
         * and bridge_trill_lnunref waiting for the trill thread to finish.
         */
        mutex_exit(&tsock->ts_socklock);

        /*
         * Release TRILL link reference from Bridging. On return from
         * bridge_trill_lnunref we can be sure there are no active TRILL data
         * threads for this link.
         */
        bridge_trill_lnunref(tsock->ts_link);

        /* Set socket as unbound & wakeup threads waiting for socket to close */
        mutex_enter(&tsock->ts_socklock);
        ASSERT(tsock->ts_link != NULL);
        tsock->ts_link = NULL;
        tsock->ts_state = TS_UNBND;
        tsock->ts_flags &= ~TSF_CLOSEWAIT;
        cv_broadcast(&tsock->ts_sockclosewait);
        mutex_exit(&tsock->ts_socklock);
}

/*
 * Bind tsock to a bridge link and start receiving on it.  sa must be a
 * struct sockaddr_dl of exactly that size whose sdl_data begins with the
 * datalink ID of the link.
 *
 * Returns 0 on success, EINVAL if the address length is wrong, the socket
 * has no TRILL instance, is not in the unbound state, or the bridge module
 * refuses the link, and EBUSY if a close is in progress or a link is still
 * attached.
 */
static int
trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
{
        struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
        datalink_id_t linkid;
        int err = 0;

        if (len != sizeof (*lladdr))
                return (EINVAL);

        mutex_enter(&tsock->ts_socklock);
        if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
                err = EINVAL;
        } else if ((tsock->ts_flags & TSF_CLOSEWAIT) ||
            tsock->ts_link != NULL) {
                err = EBUSY;
        } else {
                /* Remember the address and pull the link ID out of it */
                (void) memcpy(&(tsock->ts_lladdr), lladdr,
                    sizeof (struct sockaddr_dl));
                (void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
                    sizeof (datalink_id_t));

                tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
                    linkid, tsock);
                if (tsock->ts_link == NULL) {
                        err = EINVAL;
                } else {
                        trill_kstats_init(tsock,
                            tsock->ts_tip->ti_bridgename);
                        tsock->ts_state = TS_IDLE;
                }
        }
        mutex_exit(&tsock->ts_socklock);

        return (err);
}

/*
 * Unbind a socket from its link.  Only a socket in the TS_IDLE (bound) state
 * can be unbound; anything else yields EINVAL.  Returns 0 once receive
 * processing has been stopped.
 */
static int
trill_do_unbind(trill_sock_t *tsock)
{
        if (tsock->ts_state == TS_IDLE) {
                trill_stop_recv(tsock);
                return (0);
        }

        /* No bind has been done, so there is nothing to unbind */
        return (EINVAL);
}

/*
 * Drop one reference on a TRILL instance.  When the last reference goes
 * away the instance is unlinked from the global list (under both the global
 * and the instance lock), its bridge reference is released if it still has
 * one, and the instance is destroyed and freed.
 */
static void
trill_instance_unref(trill_inst_t *tip)
{
        boolean_t last;

        rw_enter(&trill_inst_rwlock, RW_WRITER);
        rw_enter(&tip->ti_rwlock, RW_WRITER);
        last = (atomic_dec_uint_nv(&tip->ti_refs) == 0) ? B_TRUE : B_FALSE;
        if (last)
                list_remove(&trill_inst_list, tip);
        rw_exit(&tip->ti_rwlock);
        rw_exit(&trill_inst_rwlock);

        if (!last)
                return;

        /* Nobody else can reach the instance now; tear it down */
        if (tip->ti_binst != NULL)
                bridge_trill_brunref(tip->ti_binst);
        list_destroy(&tip->ti_socklist);
        rw_destroy(&tip->ti_rwlock);
        kmem_free(tip, sizeof (*tip));
}

/*
 * Bridge-module callback for a frame received on a link that is either
 * TRILL-encapsulated or identified as TRILL control traffic.  After some
 * sanity checks on the outer addresses, TRILL data frames go to trill_recv()
 * (which decapsulates or forwards them) and everything else is passed up as
 * control traffic through trill_ctrl_input().
 *
 * On Ethernet the control traffic is recognised by Outer.MacDA rather than by
 * anything in the TRILL header.
 */
static void
trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
    mblk_t *mp, mac_header_info_t *hdr_info)
{
        trill_sock_t *tsock = lptr;
        uint16_t ctrl_tci;

        _NOTE(ARGUNUSED(rsrc));

        ASSERT(tsock->ts_tip != NULL);
        ASSERT(tsock->ts_link != NULL);
        ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));

        /*
         * Drop the frame when:
         *  - the source address is multicast (bogus),
         *  - it is our own frame reflected back to us, or
         *  - it is unicast but not addressed to us.
         */
        if ((hdr_info->mhi_saddr[0] & 1) ||
            bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0 ||
            (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
            bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)) {
                freemsg(mp);
                KSPINCR(tks_drops);
                return;
        }

        /* TRILL data packets */
        if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
                trill_recv(tsock, mp, hdr_info->mhi_saddr);
                return;
        }

        /* Design constraint for cheap IS-IS/BPDU comparison */
        ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);

        /*
         * Control traffic goes upstream.  IS-IS frames keep the TCI they
         * arrived with; anything else is flagged as a BPDU.
         */
        if (hdr_info->mhi_daddr[4] == all_isis_rbridges[4])
                ctrl_tci = hdr_info->mhi_tci;
        else
                ctrl_tci = TRILL_TCI_BPDU;
        trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr, ctrl_tci);
}

/*
 * Bridge-module callback for a native frame whose destination lies behind a
 * remote RBridge.  The frame gets a TRILL header and is sent towards
 * egress_nick; RBRIDGE_NICKNAME_NONE means "unknown destination", in which
 * case the frame is flooded over a distribution tree.
 *
 * The tree root used for flooding is the first one configured on our own
 * node, falling back to the instance-wide tree root when none is configured.
 * The frame is silently freed if the egress nickname, our own nickname or
 * the chosen tree root is not valid, or if our own node cannot be found.
 */
static void
trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
    mblk_t *mp, uint16_t egress_nick)
{
        trill_sock_t *tsock = lptr;
        trill_inst_t *tip = tsock->ts_tip;
        boolean_t flood;
        uint16_t ournick;
        uint16_t dtnick;
        int vlan;

        _NOTE(ARGUNUSED(blp));
        ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);

        /* RBRIDGE_NICKNAME_NONE is acceptable here; other bad nicks are not */
        flood = (egress_nick == RBRIDGE_NICKNAME_NONE) ? B_TRUE : B_FALSE;
        if (!flood && !VALID_NICK(egress_nick))
                goto discard;

        /* Our own nick must be valid before we do any forwarding */
        rw_enter(&tip->ti_rwlock, RW_READER);
        ournick = tip->ti_nick;
        dtnick = tip->ti_treeroot;
        rw_exit(&tip->ti_rwlock);
        if (!VALID_NICK(ournick))
                goto discard;

        if (flood) {
                trill_node_t *self;

                self = trill_node_lookup(tip, ournick);
                if (self == NULL)
                        goto discard;

                /*
                 * Use the first DT configured for now. In future we
                 * should have DT selection code here.
                 */
                if (self->tn_ni->tni_dtrootcount > 0)
                        dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
                trill_node_unref(tip, self);

                if (!VALID_NICK(dtnick)) {
                        DTRACE_PROBE(trill__fwd__packet__nodtroot);
                        goto discard;
                }
        }

        /* VLAN of the native frame, used to prune multi-destination frames */
        if (hdr_info->mhi_istagged)
                vlan = VLAN_ID(hdr_info->mhi_tci);
        else
                vlan = VLAN_ID_NONE;

        DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
            uint16_t, egress_nick);
        if (flood) {
                trill_multidest_fwd(tip, mp, dtnick,
                    ournick, B_FALSE, NULL, vlan, B_TRUE);
        } else {
                trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
                    RBRIDGE_NICKNAME_NONE);
        }
        KSPINCR(tks_encap);
        return;

discard:
        freemsg(mp);
}

/*
 * Bridge-module callback invoked once a bridge instance and all of its links
 * have been torn down.  Releases the TRILL instance's reference on the bridge
 * (if it still has one) and forgets the bridge pointer.
 */
static void
trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
{
        trill_inst_t *tip = bptr;

        _NOTE(ARGUNUSED(bip));

        rw_enter(&tip->ti_rwlock, RW_WRITER);
        if (tip->ti_binst != NULL) {
                bridge_trill_brunref(tip->ti_binst);
                tip->ti_binst = NULL;
        }
        rw_exit(&tip->ti_rwlock);
}

/*
 * Bridge-module callback invoked just before a link is torn down.  Once this
 * returns we must not start any new transmits on the link, so the socket
 * bound to it is unbound here.
 */
static void
trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
{
        _NOTE(ARGUNUSED(blp));

        /* lptr is the trill_sock_t bound to this link */
        trill_stop_recv((trill_sock_t *)lptr);
}

/*
 * Module-wide setup: creates the global list of TRILL instances and the lock
 * that guards it, then registers the receive, encapsulate, bridge-destroy and
 * link-destroy callbacks with the bridge module.
 */
static void
trill_init(void)
{
        list_create(&trill_inst_list, sizeof (trill_inst_t),
            offsetof(trill_inst_t, ti_instnode));
        rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
        bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
            trill_br_dstr_cb, trill_ln_dstr_cb);
}

/*
 * Undo trill_init(): clear the callbacks registered with the bridge module,
 * then destroy the global instance list and its lock.
 */
static void
trill_fini(void)
{
        /* Unhook from bridging before the state it relies on disappears. */
        bridge_trill_register_cb(NULL, NULL, NULL, NULL);

        list_destroy(&trill_inst_list);
        rw_destroy(&trill_inst_rwlock);
}

/* Loadable module configuration entry points */

/*
 * Module load: initialize global state and install the module.  If
 * mod_install() fails, the global state is torn down again and the error is
 * returned.
 */
int
_init(void)
{
        int err;

        trill_init();
        err = mod_install(&ml);
        if (err != 0)
                trill_fini();
        return (err);
}

/*
 * Module information: hand the request straight to mod_info().
 */
int
_info(struct modinfo *modinfop)
{
        int rc;

        rc = mod_info(&ml, modinfop);
        return (rc);
}

/*
 * Module unload.  Refuses with EBUSY while any TRILL instance is still on the
 * global list; otherwise removes the module and, if that succeeds, tears down
 * the global state.
 */
int
_fini(void)
{
        int busy;
        int err;

        rw_enter(&trill_inst_rwlock, RW_READER);
        busy = !list_is_empty(&trill_inst_list);
        rw_exit(&trill_inst_rwlock);
        if (busy)
                return (EBUSY);

        err = mod_remove(&ml);
        if (err == 0)
                trill_fini();
        return (err);
}

/*
 * Create and install the named kstats for a TRILL socket.  The kstat is
 * named "<bname>-<linkname>", where the link name is looked up from the
 * socket's link ID.  If the link name cannot be retrieved a warning is logged
 * and no kstat is created; if kstat creation fails, ts_ksp is left NULL.
 */
static void
trill_kstats_init(trill_sock_t *tsock, const char *bname)
{
        static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
        kstat_named_t *kn = (kstat_named_t *)&(tsock->ts_kstats);
        char ksname[KSTAT_STRLEN];
        char lname[MAXNAMELEN];
        int nstats;
        int idx;
        int rc;

        bzero(lname, sizeof (lname));
        rc = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, lname,
            NULL, NULL, NULL);
        if (rc != 0) {
                cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
                    " linkinfo for linkid:%d", "trill", rc,
                    tsock->ts_link->bl_linkid);
                return;
        }

        bzero(ksname, sizeof (ksname));
        (void) snprintf(ksname, sizeof (ksname), "%s-%s", bname, lname);

        /* Name each counter; ts_kstats is treated as an array of them. */
        nstats = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
        for (idx = 0; idx < nstats; idx++, kn++) {
                kstat_named_init(kn, sock_kstats_list[idx],
                    KSTAT_DATA_UINT64);
        }

        tsock->ts_ksp = kstat_create_zone("trill", 0, ksname, "sock",
            KSTAT_TYPE_NAMED, nstats, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
        if (tsock->ts_ksp == NULL)
                return;

        /* Virtual kstat: point it at the storage embedded in the socket. */
        tsock->ts_ksp->ks_data = &tsock->ts_kstats;
        kstat_install(tsock->ts_ksp);
}

/*
 * Allocate and initialize a new, unbound TRILL socket holding one reference.
 * The allocation only sleeps when SOCKET_NOSLEEP is absent from flags.
 * Returns NULL if the allocation fails.
 */
static trill_sock_t *
trill_do_open(int flags)
{
        trill_sock_t *ts;
        int kmflag;

        if (flags & SOCKET_NOSLEEP)
                kmflag = KM_NOSLEEP;
        else
                kmflag = KM_SLEEP;

        ts = kmem_zalloc(sizeof (trill_sock_t), kmflag);
        if (ts == NULL)
                return (NULL);

        ts->ts_state = TS_UNBND;
        ts->ts_refs++;
        mutex_init(&ts->ts_socklock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ts->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
        cv_init(&ts->ts_sockclosewait, NULL, CV_DRIVER, NULL);
        return (ts);
}

/*
 * Attach tsock to the TRILL instance whose bridge name is bname, taking a
 * reference on the instance.  If no such instance exists and can_create is
 * set, a new instance is created, registered with the bridge module and
 * added to the global list.
 *
 * Returns 0 on success; ENOENT if the instance does not exist and may not be
 * created, or if bridge_trill_brref() returns NULL for the name; ENOMEM if a
 * new instance was needed but the speculative allocation had failed.
 */
static int
trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
{
        trill_inst_t *inst;
        trill_inst_t *spare = NULL;

        /* Allocate some memory (speculatively) before taking locks */
        if (can_create)
                spare = kmem_zalloc(sizeof (*spare), KM_NOSLEEP);

        rw_enter(&trill_inst_rwlock, RW_WRITER);
        inst = list_head(&trill_inst_list);
        while (inst != NULL && strcmp(inst->ti_bridgename, bname) != 0)
                inst = list_next(&trill_inst_list, inst);

        if (inst == NULL) {
                if (!can_create) {
                        rw_exit(&trill_inst_rwlock);
                        return (ENOENT);
                }
                if (spare == NULL) {
                        rw_exit(&trill_inst_rwlock);
                        return (ENOMEM);
                }

                /* Consume the preallocated instance. */
                inst = spare;
                spare = NULL;
                (void) strcpy(inst->ti_bridgename, bname);

                /* Register TRILL instance with bridging */
                inst->ti_binst = bridge_trill_brref(bname, inst);
                if (inst->ti_binst == NULL) {
                        rw_exit(&trill_inst_rwlock);
                        kmem_free(inst, sizeof (*inst));
                        return (ENOENT);
                }

                rw_init(&inst->ti_rwlock, NULL, RW_DRIVER, NULL);
                list_create(&inst->ti_socklist, sizeof (trill_sock_t),
                    offsetof(trill_sock_t, ts_socklistnode));
                list_insert_tail(&trill_inst_list, inst);
        }
        atomic_inc_uint(&inst->ti_refs);
        rw_exit(&trill_inst_rwlock);

        /* If we didn't need the preallocated memory, then discard now. */
        if (spare != NULL)
                kmem_free(spare, sizeof (*spare));

        /* Link the socket onto the instance under the instance lock. */
        rw_enter(&inst->ti_rwlock, RW_WRITER);
        list_insert_tail(&inst->ti_socklist, tsock);
        tsock->ts_tip = inst;
        rw_exit(&inst->ti_rwlock);
        return (0);
}

/*
 * Remove tsock from its TRILL instance's socket list.  When that leaves the
 * list empty, every nickname entry in the instance is deleted as well.  A
 * socket with no instance is left untouched.
 */
static void
trill_clear_bridge(trill_sock_t *tsock)
{
        trill_inst_t *inst = tsock->ts_tip;

        if (inst == NULL)
                return;

        rw_enter(&inst->ti_rwlock, RW_WRITER);
        list_remove(&inst->ti_socklist, tsock);
        if (list_is_empty(&inst->ti_socklist)) {
                /* Last socket gone; the writer lock is already held. */
                trill_del_all(inst, B_TRUE);
        }
        rw_exit(&inst->ti_rwlock);
}

/*
 * Drop one reference on a TRILL socket.  When the last reference goes away
 * the socket's mutex and condition variables are destroyed and its memory is
 * freed.
 */
static void
trill_sock_unref(trill_sock_t *tsock)
{
        if (atomic_dec_uint_nv(&tsock->ts_refs) != 0)
                return;

        mutex_destroy(&tsock->ts_socklock);
        cv_destroy(&tsock->ts_sockthreadwait);
        cv_destroy(&tsock->ts_sockclosewait);
        kmem_free(tsock, sizeof (trill_sock_t));
}

/*
 * Close a TRILL socket: stop receive processing, detach the socket from its
 * instance, mark it shut down and drop the socket reference.  If the socket
 * was attached to an instance, the instance reference is dropped last.
 */
static void
trill_do_close(trill_sock_t *tsock)
{
        /* Remember the instance now; tsock may be freed by the unref below. */
        trill_inst_t *inst = tsock->ts_tip;

        trill_stop_recv(tsock);

        /* Remove socket from TRILL instance socket list */
        trill_clear_bridge(tsock);

        tsock->ts_flags |= TSF_SHUTDOWN;
        trill_sock_unref(tsock);

        if (inst != NULL)
                trill_instance_unref(inst);
}

/*
 * Delete every nickname entry in the instance whose index lies in
 * [RBRIDGE_NICKNAME_MIN, RBRIDGE_NICKNAME_MAX).  If lockheld is false the
 * instance writer lock is taken for the duration; otherwise the caller is
 * expected to hold it already.
 */
static void
trill_del_all(trill_inst_t *tip, boolean_t lockheld)
{
        int nick;

        if (!lockheld)
                rw_enter(&tip->ti_rwlock, RW_WRITER);

        nick = RBRIDGE_NICKNAME_MIN;
        while (nick < RBRIDGE_NICKNAME_MAX) {
                if (tip->ti_nodes[nick] != NULL)
                        (void) trill_del_nick(tip, nick, B_TRUE);
                nick++;
        }

        if (!lockheld)
                rw_exit(&tip->ti_rwlock);
}

/*
 * Free a nickname node together with its attached nickname information.  The
 * size of the information buffer is derived from its own header via
 * TNI_TOTALSIZE().
 */
static void
trill_node_free(trill_node_t *nick_entry)
{
        trill_nickinfo_t *info = nick_entry->tn_ni;

        kmem_free(info, TNI_TOTALSIZE(info));
        kmem_free(nick_entry, sizeof (*nick_entry));
}

/*
 * Drop one reference on a nickname node.  On the last reference the node's
 * socket reference (if any) is released, the node is freed, and the
 * instance's node count is decremented.
 */
static void
trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
{
        if (atomic_dec_uint_nv(&tnp->tn_refs) != 0)
                return;

        if (tnp->tn_tsp != NULL)
                trill_sock_unref(tnp->tn_tsp);
        trill_node_free(tnp);
        atomic_dec_uint(&tip->ti_nodecount);
}

/*
 * Look up the node for a nickname.  Returns NULL if the nickname is not
 * valid or has no entry; otherwise returns the node with an extra reference
 * taken, which the caller must release with trill_node_unref().
 */
static trill_node_t *
trill_node_lookup(trill_inst_t *tip, uint16_t nick)
{
        trill_node_t *found;

        if (!VALID_NICK(nick))
                return (NULL);

        rw_enter(&tip->ti_rwlock, RW_READER);
        if ((found = tip->ti_nodes[nick]) != NULL)
                atomic_inc_uint(&found->tn_refs);
        rw_exit(&tip->ti_rwlock);

        return (found);
}

/*
 * Remove the entry for a nickname from the instance, dropping the table's
 * reference on the node.  If lockheld is false the instance writer lock is
 * taken here; otherwise the caller is expected to hold it.
 *
 * Returns 0 if an entry was removed, or ENOENT if the nickname is not valid
 * or had no entry.
 */
static int
trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
{
        trill_node_t *victim;
        int rc = ENOENT;

        if (!lockheld)
                rw_enter(&tip->ti_rwlock, RW_WRITER);

        victim = VALID_NICK(nick) ? tip->ti_nodes[nick] : NULL;
        if (victim != NULL) {
                trill_node_unref(tip, victim);
                tip->ti_nodes[nick] = NULL;
                rc = 0;
        }

        if (!lockheld)
                rw_exit(&tip->ti_rwlock);
        return (rc);
}

/*
 * Install (or replace) the entry for one RBridge nickname.
 *
 * arg addresses a trill_nickinfo_t header followed by its variable-length
 * data; it is copied in with ddi_copyin() using the supplied mode.  When self
 * is true the nickname is also recorded as this instance's own nickname
 * (ti_nick); otherwise the new node is associated with whatever
 * find_trill_link() returns for the entry's tni_linkid.  Any existing entry
 * for the same nickname is deleted first.
 *
 * Returns 0 on success, EFAULT if either copyin fails, or EINVAL if the
 * nickname is not valid or the size described by the header is out of range.
 */
static int
trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
{
        uint16_t nick;
        int size;
        trill_node_t *tnode;
        trill_nickinfo_t tnihdr;

        /* First make sure we have at least the header available */
        if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
                return (EFAULT);

        nick = tnihdr.tni_nick;
        if (!VALID_NICK(nick)) {
                DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
                    &tnihdr);
                return (EINVAL);
        }

        /*
         * The total size must cover at least the header (it is rewritten
         * below) and must not exceed the permitted maximum.
         */
        size = TNI_TOTALSIZE(&tnihdr);
        if (size < (int)sizeof (tnihdr) || size > TNI_MAXSIZE)
                return (EINVAL);
        tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
        tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
        if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
                kmem_free(tnode->tn_ni, size);
                kmem_free(tnode, sizeof (trill_node_t));
                return (EFAULT);
        }

        /*
         * The caller's buffer has now been read twice, and its contents may
         * have changed in between.  Re-impose the header that was validated
         * above so that the stored nickname and the counts later fed to
         * TNI_TOTALSIZE() (see trill_node_free()) are exactly the ones the
         * allocation was sized from.
         */
        *tnode->tn_ni = tnihdr;

        tnode->tn_refs++;
        rw_enter(&tip->ti_rwlock, RW_WRITER);
        if (tip->ti_nodes[nick] != NULL)
                (void) trill_del_nick(tip, nick, B_TRUE);

        if (self) {
                tip->ti_nick = nick;
        } else {
                tnode->tn_tsp = find_trill_link(tip,
                    tnode->tn_ni->tni_linkid);
        }
        DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
            uint16_t, nick);
        tip->ti_nodes[nick] = tnode;
        tip->ti_nodecount++;
        rw_exit(&tip->ti_rwlock);
        return (0);
}

/*
 * Core ioctl dispatcher for a TRILL socket.
 *
 * tsock is the socket the request arrived on, cmd is the TRILL_* command,
 * arg is the caller's argument (a buffer address for most commands, or the
 * VLAN value itself for the two flush commands), and mode is passed through
 * to ddi_copyin()/ddi_copyout().  trill_ioctl() calls this with ts_socklock
 * held, after it has performed the privilege check.
 *
 * Returns 0 on success or an errno value; unknown commands yield ENOTSUP.
 * Note that several commands return 0 without doing anything when the socket
 * has no TRILL instance (ADDNICK, DELNICK, DELALL, TREEROOT) or no link
 * (HWADDR, GETMTU), whereas others return EINVAL in that situation.
 */
static int
trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
{
        int error = 0;
        trill_inst_t *tip = tsock->ts_tip;

        switch (cmd) {
        /* Record the designated VLAN for this socket. */
        case TRILL_DESIGVLAN: {
                uint16_t desigvlan;

                if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
                        return (EFAULT);
                tsock->ts_desigvlan = desigvlan;
                break;
        }
        /* Hand a caller-supplied VLAN array to the bridge for this link. */
        case TRILL_VLANFWDER: {
                uint8_t vlans[TRILL_VLANS_ARRSIZE];

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
                        return (EFAULT);
                bridge_trill_setvlans(tsock->ts_link, vlans);
                break;
        }
        /* Add a nickname entry and make it this instance's own nickname. */
        case TRILL_SETNICK:
                if (tip == NULL)
                        return (EINVAL);
                error = trill_add_nick(tip, arg, B_TRUE, mode);
                break;

        /* Copy out this instance's own nickname. */
        case TRILL_GETNICK:
                if (tip == NULL)
                        return (EINVAL);
                rw_enter(&tip->ti_rwlock, RW_READER);
                if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
                    mode) != 0)
                        error = EFAULT;
                rw_exit(&tip->ti_rwlock);
                break;

        /* Add a nickname entry for another RBridge. */
        case TRILL_ADDNICK:
                if (tip == NULL)
                        break;
                error = trill_add_nick(tip, arg, B_FALSE, mode);
                break;

        /* Delete one nickname entry; ENOENT if there was none. */
        case TRILL_DELNICK: {
                uint16_t delnick;

                if (tip == NULL)
                        break;
                if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
                        return (EFAULT);
                error = trill_del_nick(tip, delnick, B_FALSE);
                break;
        }
        /* Delete every nickname entry in the instance. */
        case TRILL_DELALL:
                if (tip == NULL)
                        break;
                trill_del_all(tip, B_FALSE);
                break;

        /* Set the instance's tree root nickname (must be a valid nick). */
        case TRILL_TREEROOT: {
                uint16_t treeroot;

                if (tip == NULL)
                        break;
                if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
                        return (EFAULT);
                if (!VALID_NICK(treeroot))
                        return (EINVAL);
                rw_enter(&tip->ti_rwlock, RW_WRITER);
                tip->ti_treeroot = treeroot;
                rw_exit(&tip->ti_rwlock);
                break;
        }
        /* Copy out the link's local MAC address (ETHERADDRL bytes). */
        case TRILL_HWADDR:
                if (tsock->ts_link == NULL)
                        break;
                if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
                    mode) != 0)
                        return (EFAULT);
                break;

        /*
         * Attach this socket to the named bridge's TRILL instance, creating
         * the instance if necessary.  Only permitted on an unbound socket.
         */
        case TRILL_NEWBRIDGE: {
                char bname[MAXLINKNAMELEN];

                if (tsock->ts_state != TS_UNBND)
                        return (ENOTSUP);
                /* ts_tip can only be set once */
                if (tip != NULL)
                        return (EEXIST);
                if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
                        return (EFAULT);
                /* Force termination; the caller's buffer is not trusted. */
                bname[MAXLINKNAMELEN-1] = '\0';
                error = trill_find_bridge(tsock, bname, B_TRUE);
                break;
        }

        /*
         * Attach this socket to an existing TRILL instance for the named
         * bridge; the instance is never created here.
         */
        case TRILL_GETBRIDGE: {
                char bname[MAXLINKNAMELEN];

                /* ts_tip can only be set once */
                if (tip != NULL)
                        return (EEXIST);
                if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
                        return (EFAULT);
                /* Force termination; the caller's buffer is not trusted. */
                bname[MAXLINKNAMELEN - 1] = '\0';
                error = trill_find_bridge(tsock, bname, B_FALSE);
                break;
        }

        /*
         * Iterate the nickname table: report the first populated entry whose
         * nickname is strictly greater than the tln_nick passed in.  When
         * there is no such entry, an all-zero structure is copied out.
         */
        case TRILL_LISTNICK: {
                trill_listnick_t tln;
                trill_node_t *tnp;
                trill_nickinfo_t *tnip;
                uint16_t nick;

                if (tip == NULL)
                        return (EINVAL);
                if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
                        return (EFAULT);
                nick = tln.tln_nick;
                if (nick >= RBRIDGE_NICKNAME_MAX) {
                        error = EINVAL;
                        break;
                }
                rw_enter(&tip->ti_rwlock, RW_READER);
                while (++nick < RBRIDGE_NICKNAME_MAX) {
                        if ((tnp = tip->ti_nodes[nick]) != NULL) {
                                tnip = tnp->tn_ni;
                                ASSERT(nick == tnip->tni_nick);
                                tln.tln_nick = nick;
                                bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
                                    ETHERADDRL);
                                tln.tln_ours = nick == tip->ti_nick;
                                /*
                                 * Our own nickname, or a node with no
                                 * associated socket, has no link to report.
                                 */
                                if (tln.tln_ours || tnp->tn_tsp == NULL) {
                                        tln.tln_linkid =
                                            DATALINK_INVALID_LINKID;
                                } else {
                                        tln.tln_linkid =
                                            tnp->tn_tsp->ts_link->bl_linkid;
                                }
                                break;
                        }
                }
                rw_exit(&tip->ti_rwlock);
                /* Ran off the end of the table: nothing more to list. */
                if (nick >= RBRIDGE_NICKNAME_MAX)
                        bzero(&tln, sizeof (tln));
                if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
                        return (EFAULT);
                break;
        }

        /*
         * Port flush: this is used when we lose AF on a port.  We must discard
         * all regular bridge forwarding entries on this port with the
         * indicated VLAN.
         */
        case TRILL_PORTFLUSH: {
                /* The VLAN is carried in arg itself, not in a buffer. */
                uint16_t vlan = (uint16_t)(uintptr_t)arg;

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
                break;
        }

        /*
         * Nick flush: this is used when we lose AF on a port.  We must discard
         * all bridge TRILL forwarding entries on this port with the indicated
         * VLAN.
         */
        case TRILL_NICKFLUSH: {
                /* The VLAN is carried in arg itself, not in a buffer. */
                uint16_t vlan = (uint16_t)(uintptr_t)arg;

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
                break;
        }

        /* Copy out the link's bl_maxsdu value as a uint_t. */
        case TRILL_GETMTU:
                if (tsock->ts_link == NULL)
                        break;
                if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
                    sizeof (uint_t), mode) != 0)
                        return (EFAULT);
                break;

        default:
                error = ENOTSUP;
                break;
        }

        return (error);
}

/*
 * Sends received packet back upstream on the TRILL socket.  A
 * T_UNITDATA_IND header carrying an AF_TRILL sockaddr_dl (receiving link ID,
 * source MAC address and TCI) is prepended before the su_recv upcall.
 * Consumes passed mblk_t.
 */
static void
trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
    uint16_t tci)
{
        const int hdrlen = sizeof (struct T_unitdata_ind) +
            sizeof (struct sockaddr_dl);
        struct T_unitdata_ind *ind;
        struct sockaddr_dl *from;
        mblk_t *hdr;
        char *cp;
        int error;

        ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));

        /* Drop when flow controlled or when no header block is available. */
        if (tsock->ts_flow_ctrld ||
            (hdr = allocb(hdrlen, BPRI_MED)) == NULL) {
                freemsg(mp);
                KSPINCR(tks_drops);
                return;
        }

        /* Chain the data behind the new protocol header block. */
        hdr->b_cont = mp;
        hdr->b_datap->db_type = M_PROTO;
        hdr->b_wptr = hdr->b_rptr + hdrlen;

        /* LINTED: alignment */
        ind = (struct T_unitdata_ind *)hdr->b_rptr;
        ind->PRIM_type = T_UNITDATA_IND;
        ind->SRC_offset = sizeof (struct T_unitdata_ind);
        ind->SRC_length = sizeof (struct sockaddr_dl);
        ind->OPT_offset = hdrlen;
        ind->OPT_length = 0;

        /* Information of the link on which packet was received. */
        from = (struct sockaddr_dl *)(ind + 1);
        (void) memset(from, 0, sizeof (struct sockaddr_dl));
        from->sdl_family = AF_TRILL;

        /* Name portion: the datalink ID of the receiving link. */
        /* LINTED: alignment */
        *(datalink_id_t *)from->sdl_data = tsock->ts_link->bl_linkid;
        from->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);

        /* Address portion: the source Ethernet address. */
        cp = LLADDR(from);
        (void) memcpy(cp, saddr, ETHERADDRL);
        from->sdl_alen = ETHERADDRL;

        /* Selector portion: the TCI, stored right after the address. */
        /* LINTED: alignment */
        *(uint16_t *)(cp + ETHERADDRL) = tci;
        from->sdl_slen = sizeof (uint16_t);

        DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *,
            hdr);
        (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
            hdr, msgdsize(hdr), 0, &error, NULL);

        switch (error) {
        case 0:
                KSPINCR(tks_recv);
                break;
        case ENOSPC:
                /*
                 * Repeat the upcall with no data under the socket lock; if
                 * it still reports ENOSPC, mark the socket flow controlled.
                 */
                mutex_enter(&tsock->ts_socklock);
                (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
                    NULL, 0, 0, &error, NULL);
                if (error == ENOSPC)
                        tsock->ts_flow_ctrld = B_TRUE;
                mutex_exit(&tsock->ts_socklock);
                KSPINCR(tks_drops);
                break;
        default:
                KSPINCR(tks_drops);
                break;
        }

        DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
            tsock, int, error);
}

/*
 * sd_activate downcall: remember the upper handle and upcall vector for this
 * socket, then report our protocol properties through su_set_proto_props.
 */
/* ARGSUSED */
static void
trill_activate(sock_lower_handle_t proto_handle,
    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
    int flags, cred_t *cr)
{
        trill_sock_t *tsock = (trill_sock_t *)proto_handle;
        struct sock_proto_props props;

        tsock->ts_conn_upper_handle = sock_handle;
        tsock->ts_conn_upcalls = sock_upcalls;

        /* Only the fields flagged here are filled in below. */
        props.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
            SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
            SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
        props.sopp_wroff = 0;
        props.sopp_rxhiwat = SOCKET_RECVHIWATER;
        props.sopp_rxlowat = SOCKET_RECVLOWATER;
        props.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
        props.sopp_minpsz = 0;
        props.sopp_maxpsz = INFPSZ;
        props.sopp_maxblk = INFPSZ;

        (*tsock->ts_conn_upcalls->su_set_proto_props)(
            tsock->ts_conn_upper_handle, &props);
}

/*
 * sd_close downcall: tear the socket down via trill_do_close().  Always
 * returns 0.
 */
/* ARGSUSED */
static int
trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
{
        trill_do_close((trill_sock_t *)proto_handle);
        return (0);
}

/*
 * sd_bind downcall.  A NULL address unbinds the socket; any other address
 * starts receive processing for it.  Returns whatever the chosen helper
 * returns.
 */
/* ARGSUSED */
static int
trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
    socklen_t len, cred_t *cr)
{
        trill_sock_t *tsock = (trill_sock_t *)proto_handle;

        if (sa == NULL)
                return (trill_do_unbind(tsock));

        return (trill_start_recv(tsock, sa, len));
}

/*
 * This is a token getsockopt() implementation so we can reply to SO_PROTOCOL.
 *
 * Only the (SOL_SOCKET, SO_PROTOCOL) pair is recognized; every other
 * level/option combination is rejected with ENOPROTOOPT.  The reported value
 * is always 0, written as an int32_t.  Returns EINVAL if the caller's buffer
 * (*optlenp) is too small to hold it; on success *optlenp is set to the
 * number of bytes written.
 */
/* ARGSUSED */
static int
trill_getsockopt(sock_lower_handle_t handle, int level,
    int option_name, void *optval, socklen_t *optlenp, struct cred *cr)
{
        int32_t value;

        /* Both the level and the option must match, so reject on either. */
        if (level != SOL_SOCKET || option_name != SO_PROTOCOL) {
                return (ENOPROTOOPT);
        }

        if (*optlenp < sizeof (int32_t)) {
                return (EINVAL);
        }

        value = 0;
        bcopy(&value, optval, sizeof (value));
        *optlenp = sizeof (value);
        return (0);
}

/*
 * sd_send downcall: transmit one datagram on the socket's link.
 *
 * The destination must be supplied as a sockaddr_dl whose name is a
 * datalink_id_t, whose address is an Ethernet address, and whose selector is
 * either empty or a 16-bit TCI.  The message is always consumed.  Returns
 * EPROTO if the address is malformed or the socket is not idle with a link;
 * otherwise returns 0, with the outcome recorded in the kstats.
 */
/* ARGSUSED */
static int
trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
    cred_t *cr)
{
        trill_sock_t *tsock = (trill_sock_t *)proto_handle;
        struct sockaddr_dl *dst;
        uint16_t tci;

        ASSERT(DB_TYPE(mp) == M_DATA);
        ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));

        if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*dst))
                goto eproto;

        /*
         * The name is a datalink_id_t, the address is an Ethernet address, and
         * the selector value is the VLAN ID.
         */
        dst = (struct sockaddr_dl *)msg->msg_name;
        if (dst->sdl_nlen != sizeof (datalink_id_t))
                goto eproto;
        if (dst->sdl_alen != ETHERADDRL)
                goto eproto;
        if (dst->sdl_slen != 0 && dst->sdl_slen != sizeof (tci))
                goto eproto;

        /* Count ourselves as an active sender while the state is stable. */
        mutex_enter(&tsock->ts_socklock);
        if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
                mutex_exit(&tsock->ts_socklock);
                goto eproto;
        }
        atomic_inc_uint(&tsock->ts_sockthreadcount);
        mutex_exit(&tsock->ts_socklock);

        /*
         * Safe to dereference VLAN now, as we've checked the user's specified
         * values, and alignment is now guaranteed.
         */
        if (dst->sdl_slen != 0) {
                /* LINTED: alignment */
                tci = *(uint16_t *)(LLADDR(dst) + ETHERADDRL);
        } else {
                tci = TRILL_NO_TCI;
        }

        mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(dst),
            B_TRUE, B_FALSE, tci, msgdsize(mp));
        if (mp != NULL) {
                /* A non-NULL return from the bridge means it was not sent. */
                mblk_t *unsent = bridge_trill_output(tsock->ts_link, mp);

                if (unsent != NULL) {
                        freemsg(unsent);
                        KSPINCR(tks_drops);
                } else {
                        KSPINCR(tks_sent);
                }
        }

        /* Wake up any threads blocking on us */
        if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
                cv_broadcast(&tsock->ts_sockthreadwait);
        return (0);

eproto:
        freemsg(mp);
        KSPINCR(tks_drops);
        return (EPROTO);
}

/*
 * sd_ioctl downcall.  TRILL_GETNICK, TRILL_GETBRIDGE and TRILL_LISTNICK are
 * unprivileged; every other command requires secpolicy_dl_config() to
 * succeed, else EPERM is returned.  The request is then handled by
 * trill_do_ioctl() with the socket lock held.
 */
/* ARGSUSED */
static int
trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
    int mode, int32_t *rvalp, cred_t *cr)
{
        trill_sock_t *tsock = (trill_sock_t *)proto_handle;
        int err;

        /* Anything outside the unprivileged list needs the policy check. */
        if (cmd != TRILL_GETNICK && cmd != TRILL_GETBRIDGE &&
            cmd != TRILL_LISTNICK && secpolicy_dl_config(cr) != 0)
                return (EPERM);

        /* Lock ensures socket state is unchanged during ioctl handling */
        mutex_enter(&tsock->ts_socklock);
        err = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
        mutex_exit(&tsock->ts_socklock);

        return (err);
}

/*
 * Flow-control downcall: clear the socket's flow-controlled flag, under the
 * socket lock, so that trill_ctrl_input() resumes delivering packets.
 */
static void
trill_clr_flowctrl(sock_lower_handle_t proto_handle)
{
        trill_sock_t *ts = (trill_sock_t *)proto_handle;

        mutex_enter(&ts->ts_socklock);
        ts->ts_flow_ctrld = B_FALSE;
        mutex_exit(&ts->ts_socklock);
}

/*
 * Downcall vector returned to the socket framework by trill_create().  The
 * entries are positional.  TRILL supplies its own activate, bind,
 * getsockopt, send, flow-control, ioctl and close handlers; the remaining
 * operations use the generic sock_*_notsupp stubs or are left NULL.
 */
static sock_downcalls_t sock_trill_downcalls = {
        trill_activate,                 /* sd_activate */
        sock_accept_notsupp,            /* sd_accept */
        trill_bind,                     /* sd_bind */
        sock_listen_notsupp,            /* sd_listen */
        sock_connect_notsupp,           /* sd_connect */
        sock_getpeername_notsupp,       /* sd_getpeername */
        sock_getsockname_notsupp,       /* sd_getsockname */
        trill_getsockopt,               /* sd_getsockopt */
        sock_setsockopt_notsupp,        /* sd_setsockopt */
        trill_send,                     /* sd_send */
        NULL,                           /* sd_send_uio */
        NULL,                           /* sd_recv_uio */
        NULL,                           /* sd_poll */
        sock_shutdown_notsupp,          /* sd_shutdown */
        trill_clr_flowctrl,             /* sd_setflowctrl */
        trill_ioctl,                    /* sd_ioctl */
        trill_close                     /* sd_close */
};

/*
 * Socket creation entry point.  Only AF_TRILL / SOCK_DGRAM / protocol 0 is
 * supported; anything else sets *errorp to EPROTONOSUPPORT and returns NULL.
 * Otherwise the downcall vector and socket mode are filled in and a new
 * socket is opened; *errorp is 0 on success or ENOMEM if that allocation
 * failed (in which case NULL is returned).
 */
/* ARGSUSED */
static sock_lower_handle_t
trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
    uint_t *smodep, int *errorp, int flags, cred_t *credp)
{
        trill_sock_t *ts;

        if (!(family == AF_TRILL && type == SOCK_DGRAM && proto == 0)) {
                *errorp = EPROTONOSUPPORT;
                return (NULL);
        }

        *sock_downcalls = &sock_trill_downcalls;
        *smodep = SM_ATOMIC;

        ts = trill_do_open(flags);
        if (ts != NULL)
                *errorp = 0;
        else
                *errorp = ENOMEM;

        return ((sock_lower_handle_t)ts);
}