root/usr/src/uts/sun4v/io/vsw_switching.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/vlan.h>

/* Switching setup routines */
void vsw_setup_switching_thread(void *arg);
int vsw_setup_switching_start(vsw_t *vswp);
void vsw_setup_switching_stop(vsw_t *vswp);
int vsw_setup_switching(vsw_t *);
void vsw_setup_switching_post_process(vsw_t *vswp);
void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t mrh);
static  int vsw_setup_layer2(vsw_t *);
static  int vsw_setup_layer3(vsw_t *);

/* Switching/data transmit routines */
static  void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t);
static  void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
        vsw_port_t *port, mac_resource_handle_t);
static  void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
        vsw_port_t *port, mac_resource_handle_t);
static  int vsw_forward_all(vsw_t *vswp, mblk_t *mp,
        int caller, vsw_port_t *port);
static  int vsw_forward_grp(vsw_t *vswp, mblk_t *mp,
    int caller, vsw_port_t *port);

/* VLAN routines */
void vsw_create_vlans(void *arg, int type);
void vsw_destroy_vlans(void *arg, int type);
void vsw_vlan_add_ids(void *arg, int type);
void vsw_vlan_remove_ids(void *arg, int type);
static  void vsw_vlan_create_hash(void *arg, int type);
static  void vsw_vlan_destroy_hash(void *arg, int type);
boolean_t vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
        uint16_t *vidp);
mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
uint32_t vsw_vlan_frames_untag(void *arg, int type, mblk_t **np, mblk_t **npt);
boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);

/* Forwarding database (FDB) routines */
void vsw_fdbe_add(vsw_t *vswp, void *port);
void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
static  vsw_fdbe_t *vsw_fdbe_find(vsw_t *vswp, struct ether_addr *);
static void vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);

int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
void vsw_del_mcst_vsw(vsw_t *);

/* Support functions */
static mblk_t *vsw_dupmsgchain(mblk_t *mp);
static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp);


/*
 * Functions imported from other files.
 */
extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t);
extern int vsw_mac_open(vsw_t *vswp);
extern void vsw_mac_close(vsw_t *vswp);
extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
    mblk_t *mp, vsw_macrx_flags_t flags);
extern void vsw_set_addrs(vsw_t *vswp);
extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
extern void vsw_hio_init(vsw_t *vswp);
extern void vsw_hio_start_ports(vsw_t *vswp);
extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port,
    mcst_addr_t *mcst_p, int type);
extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
    mcst_addr_t *mcst_p, int type);
extern void vsw_mac_link_update(vsw_t *vswp, link_state_t link_state);
extern void vsw_physlink_update_ports(vsw_t *vswp);

/*
 * Tunables used in this file.
 */
extern  int vsw_setup_switching_delay;
extern  uint32_t vsw_vlan_nchains;
extern  uint32_t vsw_fdbe_refcnt_delay;

#define VSW_FDBE_REFHOLD(p)                                             \
{                                                                       \
        atomic_inc_32(&(p)->refcnt);                                    \
        ASSERT((p)->refcnt != 0);                                       \
}

#define VSW_FDBE_REFRELE(p)                                             \
{                                                                       \
        ASSERT((p)->refcnt != 0);                                       \
        atomic_dec_32(&(p)->refcnt);                                    \
}

/*
 * Thread to setup switching mode. This thread is created during vsw_attach()
 * initially. It invokes vsw_setup_switching() and keeps retrying while the
 * returned value is EAGAIN. The thread exits when the switching mode setup is
 * done successfully or when the error returned is not EAGAIN. This thread may
 * also get created from vsw_update_md_prop() if the switching mode needs to be
 * updated.
 */
void
vsw_setup_switching_thread(void *arg)
{
        callb_cpr_t     cprinfo;
        vsw_t           *vswp =  (vsw_t *)arg;
        clock_t         wait_time;
        clock_t         xwait;
        clock_t         wait_rv;
        int             rv;

        /* wait time used on successive retries */
        xwait = drv_usectohz(vsw_setup_switching_delay * MICROSEC);

        CALLB_CPR_INIT(&cprinfo, &vswp->sw_thr_lock, callb_generic_cpr,
            "vsw_setup_sw_thread");

        mutex_enter(&vswp->sw_thr_lock);

        while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {

                CALLB_CPR_SAFE_BEGIN(&cprinfo);

                /* Wait for sometime before (re)trying setup_switching() */
                wait_time = ddi_get_lbolt() + xwait;
                while ((vswp->sw_thr_flags & VSW_SWTHR_STOP) == 0) {
                        wait_rv = cv_timedwait(&vswp->sw_thr_cv,
                            &vswp->sw_thr_lock, wait_time);
                        if (wait_rv == -1) {    /* timed out */
                                break;
                        }
                }

                CALLB_CPR_SAFE_END(&cprinfo, &vswp->sw_thr_lock)

                if ((vswp->sw_thr_flags & VSW_SWTHR_STOP) != 0) {
                        /*
                         * If there is a stop request, process that first and
                         * exit the loop. Continue to hold the mutex which gets
                         * released in CALLB_CPR_EXIT().
                         */
                        break;
                }

                mutex_exit(&vswp->sw_thr_lock);
                rv = vsw_setup_switching(vswp);
                if (rv == 0) {
                        vsw_setup_switching_post_process(vswp);
                }
                mutex_enter(&vswp->sw_thr_lock);
                if (rv != EAGAIN) {
                        break;
                }

        }

        vswp->sw_thr_flags &= ~VSW_SWTHR_STOP;
        vswp->sw_thread = NULL;
        CALLB_CPR_EXIT(&cprinfo);
        thread_exit();
}

/*
 * Create a thread to setup the switching mode.
 * Returns 0 on success; 1 on failure.
 */
int
vsw_setup_switching_start(vsw_t *vswp)
{
        mutex_enter(&vswp->sw_thr_lock);

        vswp->sw_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
            vsw_setup_switching_thread, vswp, 0, &p0, TS_RUN, minclsyspri);

        if (vswp->sw_thread == NULL) {
                mutex_exit(&vswp->sw_thr_lock);
                return (1);
        }

        mutex_exit(&vswp->sw_thr_lock);
        return (0);
}

/*
 * Stop the thread to setup switching mode.
 */
void
vsw_setup_switching_stop(vsw_t *vswp)
{
        kt_did_t        tid = 0;

        /*
         * Signal the setup_switching thread to stop and wait until it stops.
         */
        mutex_enter(&vswp->sw_thr_lock);

        if (vswp->sw_thread != NULL) {
                tid = vswp->sw_thread->t_did;
                vswp->sw_thr_flags |= VSW_SWTHR_STOP;
                cv_signal(&vswp->sw_thr_cv);
        }

        mutex_exit(&vswp->sw_thr_lock);

        if (tid != 0)
                thread_join(tid);

        (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE);

        vswp->mac_open_retries = 0;
}

/*
 * Setup the required switching mode.
 * Returns:
 *  0 on success.
 *  EAGAIN if retry is needed.
 *  1 on all other failures.
 */
int
vsw_setup_switching(vsw_t *vswp)
{
        int     rv = 1;

        D1(vswp, "%s: enter", __func__);

        /*
         * Select best switching mode.
         * This is done as this routine can be called from the timeout
         * handler to retry setting up a specific mode. Currently only
         * the function which sets up layer2/promisc mode returns EAGAIN
         * if the underlying network device is not available yet, causing
         * retries.
         */
        if (vswp->smode & VSW_LAYER2) {
                rv = vsw_setup_layer2(vswp);
        } else if (vswp->smode & VSW_LAYER3) {
                rv = vsw_setup_layer3(vswp);
        } else {
                DERR(vswp, "unknown switch mode");
                rv = 1;
        }

        if (rv && (rv != EAGAIN)) {
                cmn_err(CE_WARN, "!vsw%d: Unable to setup specified "
                    "switching mode", vswp->instance);
        } else if (rv == 0) {
                (void) atomic_swap_32(&vswp->switching_setup_done, B_TRUE);
        }

        D2(vswp, "%s: Operating in mode %d", __func__,
            vswp->smode);

        D1(vswp, "%s: exit", __func__);

        return (rv);
}

/*
 * Setup for layer 2 switching.
 *
 * Returns:
 *  0 on success.
 *  EAGAIN if retry is needed.
 *  EIO on all other failures.
 */
static int
vsw_setup_layer2(vsw_t *vswp)
{
        int     rv;

        D1(vswp, "%s: enter", __func__);

        /*
         * Until the network device is successfully opened,
         * set the switching to use vsw_switch_l2_frame.
         */
        vswp->vsw_switch_frame = vsw_switch_l2_frame;
        vswp->mac_cl_switching = B_FALSE;

        rv = strlen(vswp->physname);
        if (rv == 0) {
                /*
                 * Physical device name is NULL, which is
                 * required for layer 2.
                 */
                cmn_err(CE_WARN, "!vsw%d: no network device name specified",
                    vswp->instance);
                return (EIO);
        }

        mutex_enter(&vswp->mac_lock);

        rv = vsw_mac_open(vswp);
        if (rv != 0) {
                if (rv != EAGAIN) {
                        cmn_err(CE_WARN, "!vsw%d: Unable to open network "
                            "device: %s\n", vswp->instance, vswp->physname);
                }
                mutex_exit(&vswp->mac_lock);
                return (rv);
        }

        /*
         * Now we can use the mac client switching, so set the switching
         * function to use vsw_switch_l2_frame_mac_client(), which simply
         * sends the packets to MAC layer for switching.
         */
        vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client;
        vswp->mac_cl_switching = B_TRUE;

        D1(vswp, "%s: exit", __func__);

        /* Initialize HybridIO related stuff */
        vsw_hio_init(vswp);

        mutex_exit(&vswp->mac_lock);
        return (0);

exit_error:
        vsw_mac_close(vswp);
        mutex_exit(&vswp->mac_lock);
        return (EIO);
}

static int
vsw_setup_layer3(vsw_t *vswp)
{
        D1(vswp, "%s: enter", __func__);

        D2(vswp, "%s: operating in layer 3 mode", __func__);
        vswp->vsw_switch_frame = vsw_switch_l3_frame;

        D1(vswp, "%s: exit", __func__);

        return (0);
}

/* ARGSUSED */
void
vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port,
                        mac_resource_handle_t mrh)
{
        freemsgchain(mp);
}

/*
 * Use mac client for layer 2 switching .
 */
static void
vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller,
    vsw_port_t *port, mac_resource_handle_t mrh)
{
        _NOTE(ARGUNUSED(mrh))

        mblk_t          *ret_m;

        /*
         * This switching function is expected to be called by
         * the ports or the interface only. The packets from
         * physical interface already switched.
         */
        ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV));

        if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) {
                DERR(vswp, "%s: drop mblks to "
                    "phys dev", __func__);
                freemsgchain(ret_m);
        }
}

/*
 * Switch the given ethernet frame when operating in layer 2 mode.
 *
 * vswp: pointer to the vsw instance
 * mp: pointer to chain of ethernet frame(s) to be switched
 * caller: identifies the source of this frame as:
 *              1. VSW_VNETPORT - a vsw port (connected to a vnet).
 *              2. VSW_PHYSDEV - the physical ethernet device
 *              3. VSW_LOCALDEV - vsw configured as a virtual interface
 * arg: argument provided by the caller.
 *              1. for VNETPORT - pointer to the corresponding vsw_port_t.
 *              2. for PHYSDEV - NULL
 *              3. for LOCALDEV - pointer to to this vsw_t(self)
 */
void
vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
                        vsw_port_t *arg, mac_resource_handle_t mrh)
{
        struct ether_header     *ehp;
        mblk_t                  *bp, *ret_m;
        vsw_fdbe_t              *fp;

        D1(vswp, "%s: enter (caller %d)", __func__, caller);

        /*
         * PERF: rather than breaking up the chain here, scan it
         * to find all mblks heading to same destination and then
         * pass that sub-chain to the lower transmit functions.
         */

        /* process the chain of packets */
        bp = mp;
        while (bp) {
                ehp = (struct ether_header *)bp->b_rptr;
                mp = vsw_get_same_dest_list(ehp, &bp);
                ASSERT(mp != NULL);

                D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
                    __func__, MBLKSIZE(mp), MBLKL(mp));

                if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
                        /*
                         * If destination is VSW_LOCALDEV (vsw as an eth
                         * interface) and if the device is up & running,
                         * send the packet up the stack on this host.
                         * If the virtual interface is down, drop the packet.
                         */
                        if (caller != VSW_LOCALDEV) {
                                vsw_mac_rx(vswp, mrh, mp, VSW_MACRX_FREEMSG);
                        } else {
                                freemsgchain(mp);
                        }
                        continue;
                }

                /*
                 * Find fdb entry for the destination
                 * and hold a reference to it.
                 */
                fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
                if (fp != NULL) {

                        /*
                         * If plumbed and in promisc mode then copy msg
                         * and send up the stack.
                         */
                        vsw_mac_rx(vswp, mrh, mp,
                            VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);

                        /*
                         * If the destination is in FDB, the packet
                         * should be forwarded to the correponding
                         * vsw_port (connected to a vnet device -
                         * VSW_VNETPORT)
                         */
                        (void) vsw_portsend(fp->portp, mp);

                        /* Release the reference on the fdb entry */
                        VSW_FDBE_REFRELE(fp);
                } else {
                        /*
                         * Destination not in FDB.
                         *
                         * If the destination is broadcast or
                         * multicast forward the packet to all
                         * (VNETPORTs, PHYSDEV, LOCALDEV),
                         * except the caller.
                         */
                        if (IS_BROADCAST(ehp)) {
                                D2(vswp, "%s: BROADCAST pkt", __func__);
                                (void) vsw_forward_all(vswp, mp, caller, arg);
                        } else if (IS_MULTICAST(ehp)) {
                                D2(vswp, "%s: MULTICAST pkt", __func__);
                                (void) vsw_forward_grp(vswp, mp, caller, arg);
                        } else {
                                /*
                                 * If the destination is unicast, and came
                                 * from either a logical network device or
                                 * the switch itself when it is plumbed, then
                                 * send it out on the physical device and also
                                 * up the stack if the logical interface is
                                 * in promiscious mode.
                                 *
                                 * NOTE:  The assumption here is that if we
                                 * cannot find the destination in our fdb, its
                                 * a unicast address, and came from either a
                                 * vnet or down the stack (when plumbed) it
                                 * must be destinded for an ethernet device
                                 * outside our ldoms.
                                 */
                                if (caller == VSW_VNETPORT) {
                                        /* promisc check copy etc */
                                        vsw_mac_rx(vswp, mrh, mp,
                                            VSW_MACRX_PROMISC |
                                            VSW_MACRX_COPYMSG);

                                        if ((ret_m = vsw_tx_msg(vswp, mp,
                                            caller, arg)) != NULL) {
                                                DERR(vswp, "%s: drop mblks to "
                                                    "phys dev", __func__);
                                                freemsgchain(ret_m);
                                        }

                                } else if (caller == VSW_PHYSDEV) {
                                        /*
                                         * Pkt seen because card in promisc
                                         * mode. Send up stack if plumbed in
                                         * promisc mode, else drop it.
                                         */
                                        vsw_mac_rx(vswp, mrh, mp,
                                            VSW_MACRX_PROMISC |
                                            VSW_MACRX_FREEMSG);

                                } else if (caller == VSW_LOCALDEV) {
                                        /*
                                         * Pkt came down the stack, send out
                                         * over physical device.
                                         */
                                        if ((ret_m = vsw_tx_msg(vswp, mp,
                                            caller, NULL)) != NULL) {
                                                DERR(vswp, "%s: drop mblks to "
                                                    "phys dev", __func__);
                                                freemsgchain(ret_m);
                                        }
                                }
                        }
                }
        }
        D1(vswp, "%s: exit\n", __func__);
}

/*
 * Switch ethernet frame when in layer 3 mode (i.e. using IP
 * layer to do the routing).
 *
 * There is a large amount of overlap between this function and
 * vsw_switch_l2_frame. At some stage we need to revisit and refactor
 * both these functions.
 */
void
vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
                        vsw_port_t *arg, mac_resource_handle_t mrh)
{
        struct ether_header     *ehp;
        mblk_t                  *bp = NULL;
        vsw_fdbe_t              *fp;

        D1(vswp, "%s: enter (caller %d)", __func__, caller);

        /*
         * In layer 3 mode should only ever be switching packets
         * between IP layer and vnet devices. So make sure thats
         * who is invoking us.
         */
        if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
                DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
                freemsgchain(mp);
                return;
        }

        /* process the chain of packets */
        bp = mp;
        while (bp) {
                ehp = (struct ether_header *)bp->b_rptr;
                mp = vsw_get_same_dest_list(ehp, &bp);
                ASSERT(mp != NULL);

                D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
                    __func__, MBLKSIZE(mp), MBLKL(mp));

                /*
                 * Find fdb entry for the destination
                 * and hold a reference to it.
                 */
                fp = vsw_fdbe_find(vswp, &ehp->ether_dhost);
                if (fp != NULL) {

                        D2(vswp, "%s: sending to target port", __func__);
                        (void) vsw_portsend(fp->portp, mp);

                        /* Release the reference on the fdb entry */
                        VSW_FDBE_REFRELE(fp);
                } else {
                        /*
                         * Destination not in FDB
                         *
                         * If the destination is broadcast or
                         * multicast forward the packet to all
                         * (VNETPORTs, PHYSDEV, LOCALDEV),
                         * except the caller.
                         */
                        if (IS_BROADCAST(ehp)) {
                                D2(vswp, "%s: BROADCAST pkt", __func__);
                                (void) vsw_forward_all(vswp, mp, caller, arg);
                        } else if (IS_MULTICAST(ehp)) {
                                D2(vswp, "%s: MULTICAST pkt", __func__);
                                (void) vsw_forward_grp(vswp, mp, caller, arg);
                        } else {
                                /*
                                 * Unicast pkt from vnet that we don't have
                                 * an FDB entry for, so must be destinded for
                                 * the outside world. Attempt to send up to the
                                 * IP layer to allow it to deal with it.
                                 */
                                if (caller == VSW_VNETPORT) {
                                        vsw_mac_rx(vswp, mrh,
                                            mp, VSW_MACRX_FREEMSG);
                                }
                        }
                }
        }

        D1(vswp, "%s: exit", __func__);
}

/*
 * Additional initializations that are needed for the specific switching mode.
 */
void
vsw_setup_switching_post_process(vsw_t *vswp)
{
        link_state_t    link_state = LINK_STATE_UP;

        if (vswp->smode & VSW_LAYER2) {
                /*
                 * Program unicst, mcst addrs of vsw
                 * interface and ports in the physdev.
                 */
                vsw_set_addrs(vswp);

                /* Start HIO for ports that have already connected */
                vsw_hio_start_ports(vswp);

                if (vswp->pls_update == B_TRUE) {
                        link_state = vswp->phys_link_state;
                }

                /* Update physical link info to any ports already connected */
                vsw_physlink_update_ports(vswp);
        }

        vsw_mac_link_update(vswp, link_state);
}

/*
 * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
 * except the caller (port on which frame arrived).
 */
static int
vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
{
        vsw_port_list_t *plist = &vswp->plist;
        vsw_port_t      *portp;
        mblk_t          *nmp = NULL;
        mblk_t          *ret_m = NULL;
        int             skip_port = 0;

        D1(vswp, "vsw_forward_all: enter\n");

        /*
         * Broadcast message from inside ldoms so send to outside
         * world if in either of layer 2 modes.
         */
        if ((vswp->smode & VSW_LAYER2) &&
            ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {

                nmp = vsw_dupmsgchain(mp);
                if (nmp) {
                        if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
                            != NULL) {
                                DERR(vswp, "%s: dropping pkt(s) "
                                    "consisting of %ld bytes of data for"
                                    " physical device", __func__, MBLKL(ret_m));
                                freemsgchain(ret_m);
                        }
                }
        }

        if (caller == VSW_VNETPORT)
                skip_port = 1;

        /*
         * Broadcast message from other vnet (layer 2 or 3) or outside
         * world (layer 2 only), send up stack if plumbed.
         */
        if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
                vsw_mac_rx(vswp, NULL, mp, VSW_MACRX_COPYMSG);
        }

        /* send it to all VNETPORTs */
        READ_ENTER(&plist->lockrw);
        for (portp = plist->head; portp != NULL; portp = portp->p_next) {
                D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
                /*
                 * Caution ! - don't reorder these two checks as arg
                 * will be NULL if the caller is PHYSDEV. skip_port is
                 * only set if caller is VNETPORT.
                 */
                if ((skip_port) && (portp == arg)) {
                        continue;
                } else {
                        nmp = vsw_dupmsgchain(mp);
                        if (nmp) {
                                /*
                                 * The plist->lockrw is protecting the
                                 * portp from getting destroyed here.
                                 * So, no ref_cnt is incremented here.
                                 */
                                (void) vsw_portsend(portp, nmp);
                        } else {
                                DERR(vswp, "vsw_forward_all: nmp NULL");
                        }
                }
        }
        RW_EXIT(&plist->lockrw);

        freemsgchain(mp);

        D1(vswp, "vsw_forward_all: exit\n");
        return (0);
}

/*
 * Forward pkts to any devices or interfaces which have registered
 * an interest in them (i.e. multicast groups).
 */
static int
vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
{
        struct ether_header     *ehp = (struct ether_header *)mp->b_rptr;
        mfdb_ent_t              *entp = NULL;
        mfdb_ent_t              *tpp = NULL;
        vsw_port_t              *port;
        uint64_t                key = 0;
        mblk_t                  *nmp = NULL;
        mblk_t                  *ret_m = NULL;
        boolean_t               check_if = B_TRUE;

        /*
         * Convert address to hash table key
         */
        KEY_HASH(key, &ehp->ether_dhost);

        D1(vswp, "%s: key 0x%llx", __func__, key);

        /*
         * If pkt came from either a vnet or down the stack (if we are
         * plumbed) and we are in layer 2 mode, then we send the pkt out
         * over the physical adapter, and then check to see if any other
         * vnets are interested in it.
         */
        if ((vswp->smode & VSW_LAYER2) &&
            ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
                nmp = vsw_dupmsgchain(mp);
                if (nmp) {
                        if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg))
                            != NULL) {
                                DERR(vswp, "%s: dropping pkt(s) consisting of "
                                    "%ld bytes of data for physical device",
                                    __func__, MBLKL(ret_m));
                                freemsgchain(ret_m);
                        }
                }
        }

        READ_ENTER(&vswp->mfdbrw);
        if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
            (mod_hash_val_t *)&entp) != 0) {
                D3(vswp, "%s: no table entry found for addr 0x%llx",
                    __func__, key);
        } else {
                /*
                 * Send to list of devices associated with this address...
                 */
                for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {

                        /* dont send to ourselves */
                        if ((caller == VSW_VNETPORT) &&
                            (tpp->d_addr == (void *)arg)) {
                                port = (vsw_port_t *)tpp->d_addr;
                                D3(vswp, "%s: not sending to ourselves"
                                    " : port %d", __func__, port->p_instance);
                                continue;

                        } else if ((caller == VSW_LOCALDEV) &&
                            (tpp->d_type == VSW_LOCALDEV)) {
                                D2(vswp, "%s: not sending back up stack",
                                    __func__);
                                continue;
                        }

                        if (tpp->d_type == VSW_VNETPORT) {
                                port = (vsw_port_t *)tpp->d_addr;
                                D3(vswp, "%s: sending to port %ld for addr "
                                    "0x%llx", __func__, port->p_instance, key);

                                nmp = vsw_dupmsgchain(mp);
                                if (nmp) {
                                        /*
                                         * The vswp->mfdbrw is protecting the
                                         * portp from getting destroyed here.
                                         * So, no ref_cnt is incremented here.
                                         */
                                        (void) vsw_portsend(port, nmp);
                                }
                        } else {
                                vsw_mac_rx(vswp, NULL,
                                    mp, VSW_MACRX_COPYMSG);
                                D2(vswp, "%s: sending up stack"
                                    " for addr 0x%llx", __func__, key);
                                check_if = B_FALSE;
                        }
                }
        }

        RW_EXIT(&vswp->mfdbrw);

        /*
         * If the pkt came from either a vnet or from physical device,
         * and if we havent already sent the pkt up the stack then we
         * check now if we can/should (i.e. the interface is plumbed
         * and in promisc mode).
         */
        if ((check_if) &&
            ((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
                vsw_mac_rx(vswp, NULL, mp,
                    VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG);
        }

        freemsgchain(mp);

        D1(vswp, "%s: exit", __func__);

        return (0);
}

/*
 * This function creates the vlan id hash table for the given vsw device or
 * port. It then adds each vlan that the device or port has been assigned,
 * into this hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_create_vlans(void *arg, int type)
{
        /* create vlan hash table */
        vsw_vlan_create_hash(arg, type);

        /* add vlan ids of the vsw device into its hash table */
        vsw_vlan_add_ids(arg, type);
}

/*
 * This function removes the vlan ids of the vsw device or port from its hash
 * table. It then destroys the vlan hash table.
 * Arguments:
 *   arg:  vsw device or port.
 *   type: type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port).
 */
void
vsw_destroy_vlans(void *arg, int type)
{
        /* remove vlan ids from the hash table */
        vsw_vlan_remove_ids(arg, type);

        /* destroy vlan-hash-table */
        vsw_vlan_destroy_hash(arg, type);
}

/*
 * Create a vlan-id hash table for the given vsw device or port.
 */
static void
vsw_vlan_create_hash(void *arg, int type)
{
        char            hashname[MAXNAMELEN];

        if (type == VSW_LOCALDEV) {
                vsw_t           *vswp = (vsw_t *)arg;

                (void) snprintf(hashname, MAXNAMELEN, "vsw%d-vlan-hash",
                    vswp->instance);

                vswp->vlan_nchains = vsw_vlan_nchains;
                vswp->vlan_hashp = mod_hash_create_idhash(hashname,
                    vswp->vlan_nchains, mod_hash_null_valdtor);

        } else if (type == VSW_VNETPORT) {
                vsw_port_t      *portp = (vsw_port_t *)arg;

                (void) snprintf(hashname, MAXNAMELEN, "port%d-vlan-hash",
                    portp->p_instance);

                portp->vlan_nchains = vsw_vlan_nchains;
                portp->vlan_hashp = mod_hash_create_idhash(hashname,
                    portp->vlan_nchains, mod_hash_null_valdtor);

        } else {
                return;
        }
}

/*
 * Destroy the vlan-id hash table for the given vsw device or port.
 */
static void
vsw_vlan_destroy_hash(void *arg, int type)
{
        if (type == VSW_LOCALDEV) {
                vsw_t           *vswp = (vsw_t *)arg;

                mod_hash_destroy_hash(vswp->vlan_hashp);
                vswp->vlan_nchains = 0;
        } else if (type == VSW_VNETPORT) {
                vsw_port_t      *portp = (vsw_port_t *)arg;

                mod_hash_destroy_hash(portp->vlan_hashp);
                portp->vlan_nchains = 0;
        } else {
                return;
        }
}

/*
 * Add vlan ids of the given vsw device or port into its hash table.
 */
void
vsw_vlan_add_ids(void *arg, int type)
{
        int     rv;
        int     i;

        if (type == VSW_LOCALDEV) {
                vsw_t           *vswp = (vsw_t *)arg;

                rv = mod_hash_insert(vswp->vlan_hashp,
                    (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
                    (mod_hash_val_t)B_TRUE);
                if (rv != 0) {
                        cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
                            "the interface", vswp->instance, vswp->pvid);
                }

                for (i = 0; i < vswp->nvids; i++) {
                        rv = mod_hash_insert(vswp->vlan_hashp,
                            (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid),
                            (mod_hash_val_t)B_TRUE);
                        if (rv != 0) {
                                cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
                                    " for the interface", vswp->instance,
                                    vswp->pvid);
                        }
                }

        } else if (type == VSW_VNETPORT) {
                vsw_port_t      *portp = (vsw_port_t *)arg;
                vsw_t           *vswp = portp->p_vswp;

                rv = mod_hash_insert(portp->vlan_hashp,
                    (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
                    (mod_hash_val_t)B_TRUE);
                if (rv != 0) {
                        cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for "
                            "the port(%d)", vswp->instance, vswp->pvid,
                            portp->p_instance);
                }

                for (i = 0; i < portp->nvids; i++) {
                        rv = mod_hash_insert(portp->vlan_hashp,
                            (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid),
                            (mod_hash_val_t)B_TRUE);
                        if (rv != 0) {
                                cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)"
                                    " for the port(%d)", vswp->instance,
                                    vswp->pvid, portp->p_instance);
                        }
                }

        }
}

/*
 * Remove vlan ids of the given vsw device or port from its hash table.
 */
void
vsw_vlan_remove_ids(void *arg, int type)
{
        mod_hash_val_t  vp;
        int             rv;
        int             i;

        if (type == VSW_LOCALDEV) {
                vsw_t           *vswp = (vsw_t *)arg;

                rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->pvid);
                if (rv == B_TRUE) {
                        rv = mod_hash_remove(vswp->vlan_hashp,
                            (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid),
                            (mod_hash_val_t *)&vp);
                        ASSERT(rv == 0);
                }

                for (i = 0; i < vswp->nvids; i++) {
                        rv = vsw_vlan_lookup(vswp->vlan_hashp,
                            vswp->vids[i].vl_vid);
                        if (rv == B_TRUE) {
                                rv = mod_hash_remove(vswp->vlan_hashp,
                                    (mod_hash_key_t)VLAN_ID_KEY(
                                    vswp->vids[i].vl_vid),
                                    (mod_hash_val_t *)&vp);
                                ASSERT(rv == 0);
                        }
                }

        } else if (type == VSW_VNETPORT) {
                vsw_port_t      *portp = (vsw_port_t *)arg;

                portp = (vsw_port_t *)arg;
                rv = vsw_vlan_lookup(portp->vlan_hashp, portp->pvid);
                if (rv == B_TRUE) {
                        rv = mod_hash_remove(portp->vlan_hashp,
                            (mod_hash_key_t)VLAN_ID_KEY(portp->pvid),
                            (mod_hash_val_t *)&vp);
                        ASSERT(rv == 0);
                }

                for (i = 0; i < portp->nvids; i++) {
                        rv = vsw_vlan_lookup(portp->vlan_hashp,
                            portp->vids[i].vl_vid);
                        if (rv == B_TRUE) {
                                rv = mod_hash_remove(portp->vlan_hashp,
                                    (mod_hash_key_t)VLAN_ID_KEY(
                                    portp->vids[i].vl_vid),
                                    (mod_hash_val_t *)&vp);
                                ASSERT(rv == 0);
                        }
                }

        } else {
                return;
        }
}

/*
 * Find the given vlan id in the hash table.
 * Return: B_TRUE if the id is found; B_FALSE if not found.
 */
boolean_t
vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid)
{
        int             rv;
        mod_hash_val_t  vp;

        rv = mod_hash_find(vlan_hashp, VLAN_ID_KEY(vid), (mod_hash_val_t *)&vp);

        if (rv != 0)
                return (B_FALSE);

        return (B_TRUE);
}

/*
 * Add an entry into FDB for the given vsw.
 */
void
vsw_fdbe_add(vsw_t *vswp, void *port)
{
        uint64_t        addr = 0;
        vsw_port_t      *portp;
        vsw_fdbe_t      *fp;
        int             rv;

        portp = (vsw_port_t *)port;
        KEY_HASH(addr, &portp->p_macaddr);

        fp = kmem_zalloc(sizeof (vsw_fdbe_t), KM_SLEEP);
        fp->portp = port;

        /*
         * Note: duplicate keys will be rejected by mod_hash.
         */
        rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr,
            (mod_hash_val_t)fp);
        if (rv != 0) {
                cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for "
                    "the port(%d)", vswp->instance,
                    ether_sprintf(&portp->p_macaddr), portp->p_instance);
                kmem_free(fp, sizeof (*fp));
        }
}

/*
 * Remove an entry from FDB.
 */
void
vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr)
{
        uint64_t        addr = 0;
        vsw_fdbe_t      *fp;
        int             rv;

        KEY_HASH(addr, eaddr);

        /*
         * Remove the entry from fdb hash table.
         * This prevents further references to this fdb entry.
         */
        rv = mod_hash_remove(vswp->fdb_hashp, (mod_hash_key_t)addr,
            (mod_hash_val_t *)&fp);
        if (rv != 0) {
                /* invalid key? */
                return;
        }

        /*
         * If there are threads already ref holding before the entry was
         * removed from hash table, then wait for ref count to drop to zero.
         */
        while (fp->refcnt != 0) {
                delay(drv_usectohz(vsw_fdbe_refcnt_delay));
        }

        kmem_free(fp, sizeof (*fp));
}

/*
 * Search fdb for a given mac address. If an entry is found, hold
 * a reference to it and return the entry, else returns NULL.
 */
static vsw_fdbe_t *
vsw_fdbe_find(vsw_t *vswp, struct ether_addr *addrp)
{
        uint64_t        key = 0;
        vsw_fdbe_t      *fp;
        int             rv;

        KEY_HASH(key, addrp);

        rv = mod_hash_find_cb(vswp->fdb_hashp, (mod_hash_key_t)key,
            (mod_hash_val_t *)&fp, vsw_fdbe_find_cb);

        if (rv != 0)
                return (NULL);

        return (fp);
}

/*
 * Callback function provided to mod_hash_find_cb(). After finding the fdb
 * entry corresponding to the key (macaddr), this callback will be invoked by
 * mod_hash_find_cb() to atomically increment the reference count on the fdb
 * entry before returning the found entry.
 */
static void
vsw_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
{
        _NOTE(ARGUNUSED(key))
        VSW_FDBE_REFHOLD((vsw_fdbe_t *)val);
}

/*
 * A given frame must be always tagged with the appropriate vlan id (unless it
 * is in the default-vlan) before the mac address switching function is called.
 * Otherwise, after switching function determines the destination, we cannot
 * figure out if the destination belongs to the the same vlan that the frame
 * originated from and if it needs tag/untag. Frames which are inbound from
 * the external(physical) network over a vlan trunk link are always tagged.
 * However frames which are received from a vnet-port over ldc or frames which
 * are coming down the stack on the service domain over vsw interface may be
 * untagged. These frames must be tagged with the appropriate pvid of the
 * sender (vnet-port or vsw device), before invoking the switching function.
 *
 * Arguments:
 *   arg:    caller of the function.
 *   type:   type of arg(caller): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
 *   mp:     frame(s) to be tagged.
 */
mblk_t *
vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp)
{
        vsw_t                   *vswp;
        vsw_port_t              *portp;
        struct ether_header     *ehp;
        mblk_t                  *bp;
        mblk_t                  *bpt;
        mblk_t                  *bph;
        mblk_t                  *bpn;
        uint16_t                pvid;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        if (type == VSW_LOCALDEV) {
                vswp = (vsw_t *)arg;
                pvid = vswp->pvid;
                portp = NULL;
        } else {
                /* VSW_VNETPORT */
                portp = (vsw_port_t *)arg;
                pvid = portp->pvid;
                vswp = portp->p_vswp;
        }

        bpn = bph = bpt = NULL;

        for (bp = mp; bp != NULL; bp = bpn) {

                bpn = bp->b_next;
                bp->b_next = bp->b_prev = NULL;

                /* Determine if it is an untagged frame */
                ehp = (struct ether_header *)bp->b_rptr;

                if (ehp->ether_type != ETHERTYPE_VLAN) {        /* untagged */

                        /* no need to tag if the frame is in default vlan */
                        if (pvid != vswp->default_vlan_id) {
                                bp = vnet_vlan_insert_tag(bp, pvid);
                                if (bp == NULL) {
                                        continue;
                                }
                        }
                }

                /* build a chain of processed packets */
                if (bph == NULL) {
                        bph = bpt = bp;
                } else {
                        bpt->b_next = bp;
                        bpt = bp;
                }

        }

        return (bph);
}

/*
 * Frames destined to a vnet-port or to the local vsw interface, must be
 * untagged if necessary before sending. This function first checks that the
 * frame can be sent to the destination in the vlan identified by the frame
 * tag. Note that when this function is invoked the frame must have been
 * already tagged (unless it is in the default-vlan). Because, this function is
 * called when the switching function determines the destination and invokes
 * its send function (vnet-port or vsw interface) and all frames would have
 * been tagged by this time (see comments in vsw_vlan_frame_pretag()).
 *
 * Arguments:
 *   arg:    destination device.
 *   type:   type of arg(destination): VSW_LOCALDEV(vsw) or VSW_VNETPORT(port)
 *   np:     head of pkt chain to be validated and untagged.
 *   npt:    tail of pkt chain to be validated and untagged.
 *
 * Returns:
 *   np:     head of updated chain of packets
 *   npt:    tail of updated chain of packets
 *   rv:     count of the packets in the returned list
 */
uint32_t
vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt)
{
        mblk_t                  *bp;
        mblk_t                  *bpt;
        mblk_t                  *bph;
        mblk_t                  *bpn;
        vsw_port_t              *portp;
        vsw_t                   *vswp;
        uint32_t                count;
        struct ether_header     *ehp;
        boolean_t               is_tagged;
        boolean_t               rv;
        uint16_t                vlan_id;
        uint16_t                pvid;
        mod_hash_t              *vlan_hashp;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));


        if (type == VSW_LOCALDEV) {
                vswp = (vsw_t *)arg;
                pvid = vswp->pvid;
                vlan_hashp = vswp->vlan_hashp;
                portp = NULL;
        } else {
                /* type == VSW_VNETPORT */
                portp = (vsw_port_t *)arg;
                vswp = portp->p_vswp;
                vlan_hashp = portp->vlan_hashp;
                pvid = portp->pvid;
        }

        /*
         * If the MAC layer switching in place, then
         * untagging required only if the pvid is not
         * the same as default_vlan_id. This is because,
         * the MAC layer will send packets for the
         * registered vlans only.
         */
        if ((vswp->mac_cl_switching == B_TRUE) &&
            (pvid == vswp->default_vlan_id)) {
                /* simply count and set the tail */
                count = 1;
                bp = *np;
                ASSERT(bp != NULL);
                while (bp->b_next != NULL) {
                        bp = bp->b_next;
                        count++;
                }
                *npt = bp;
                return (count);
        }

        bpn = bph = bpt = NULL;
        count = 0;

        for (bp = *np; bp != NULL; bp = bpn) {

                bpn = bp->b_next;
                bp->b_next = bp->b_prev = NULL;

                /*
                 * Determine the vlan id that the frame belongs to.
                 */
                ehp = (struct ether_header *)bp->b_rptr;
                is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id);

                /*
                 * If MAC layer switching in place, then we
                 * need to untag only if the tagged packet has
                 * vlan-id same as the pvid.
                 */
                if (vswp->mac_cl_switching == B_TRUE) {

                        /* only tagged packets expected here */
                        ASSERT(is_tagged == B_TRUE);
                        if (vlan_id == pvid) {
                                bp = vnet_vlan_remove_tag(bp);
                                if (bp == NULL) {
                                        /* packet dropped */
                                        continue;
                                }
                        }
                } else { /* No MAC layer switching */

                        /*
                         * Check the frame header if tag/untag is  needed.
                         */
                        if (is_tagged == B_FALSE) {
                                /*
                                 * Untagged frame. We shouldn't have an
                                 * untagged packet at this point, unless
                                 * the destination's  vlan id is
                                 * default-vlan-id; if it is not the
                                 * default-vlan-id, we drop the packet.
                                 */
                                if (vlan_id != vswp->default_vlan_id) {
                                        /* drop the packet */
                                        freemsg(bp);
                                        continue;
                                }
                        } else {        /* Tagged */
                                /*
                                 * Tagged frame, untag if it's the
                                 * destination's pvid.
                                 */
                                if (vlan_id == pvid) {

                                        bp = vnet_vlan_remove_tag(bp);
                                        if (bp == NULL) {
                                                /* packet dropped */
                                                continue;
                                        }
                                } else {

                                        /*
                                         * Check if the destination is in the
                                         * same vlan.
                                         */
                                        rv = vsw_vlan_lookup(vlan_hashp,
                                            vlan_id);
                                        if (rv == B_FALSE) {
                                                /* drop the packet */
                                                freemsg(bp);
                                                continue;
                                        }
                                }

                        }
                }

                /* build a chain of processed packets */
                if (bph == NULL) {
                        bph = bpt = bp;
                } else {
                        bpt->b_next = bp;
                        bpt = bp;
                }
                count++;
        }

        *np = bph;
        *npt = bpt;
        return (count);
}

/*
 * Lookup the vlan id of the given frame. If it is a vlan-tagged frame,
 * then the vlan-id is available in the tag; otherwise, its vlan id is
 * implicitly obtained based on the caller (destination of the frame:
 * VSW_VNETPORT or VSW_LOCALDEV).
 * The vlan id determined is returned in vidp.
 * Returns: B_TRUE if it is a tagged frame; B_FALSE if it is untagged.
 */
boolean_t
vsw_frame_lookup_vid(void *arg, int caller, struct ether_header *ehp,
        uint16_t *vidp)
{
        struct ether_vlan_header        *evhp;
        vsw_t                           *vswp;
        vsw_port_t                      *portp;

        /* If it's a tagged frame, get the vid from vlan header */
        if (ehp->ether_type == ETHERTYPE_VLAN) {

                evhp = (struct ether_vlan_header *)ehp;
                *vidp = VLAN_ID(ntohs(evhp->ether_tci));
                return (B_TRUE);
        }

        /* Untagged frame; determine vlan id based on caller */
        switch (caller) {

        case VSW_VNETPORT:
                /*
                 * packet destined to a vnet; vlan-id is pvid of vnet-port.
                 */
                portp = (vsw_port_t *)arg;
                *vidp = portp->pvid;
                break;

        case VSW_LOCALDEV:

                /*
                 * packet destined to vsw interface;
                 * vlan-id is port-vlan-id of vsw device.
                 */
                vswp = (vsw_t *)arg;
                *vidp = vswp->pvid;
                break;
        }

        return (B_FALSE);
}

/*
 * Add or remove multicast address(es).
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
{
        mcst_addr_t             *mcst_p = NULL;
        vsw_t                   *vswp = port->p_vswp;
        uint64_t                addr = 0x0;
        int                     i;

        D1(vswp, "%s: enter", __func__);

        D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);

        for (i = 0; i < mcst_pkt->count; i++) {
                /*
                 * Convert address into form that can be used
                 * as hash table key.
                 */
                KEY_HASH(addr, &(mcst_pkt->mca[i]));

                /*
                 * Add or delete the specified address/port combination.
                 */
                if (mcst_pkt->set == 0x1) {
                        D3(vswp, "%s: adding multicast address 0x%llx for "
                            "port %ld", __func__, addr, port->p_instance);
                        if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
                                /*
                                 * Update the list of multicast
                                 * addresses contained within the
                                 * port structure to include this new
                                 * one.
                                 */
                                mcst_p = kmem_zalloc(sizeof (mcst_addr_t),
                                    KM_NOSLEEP);
                                if (mcst_p == NULL) {
                                        DERR(vswp, "%s: unable to alloc mem",
                                            __func__);
                                        (void) vsw_del_mcst(vswp,
                                            VSW_VNETPORT, addr, port);
                                        return (1);
                                }

                                mcst_p->nextp = NULL;
                                mcst_p->addr = addr;
                                ether_copy(&mcst_pkt->mca[i], &mcst_p->mca);

                                /*
                                 * Program the address into HW. If the addr
                                 * has already been programmed then the MAC
                                 * just increments a ref counter (which is
                                 * used when the address is being deleted)
                                 */
                                if (vsw_mac_multicast_add(vswp, port, mcst_p,
                                    VSW_VNETPORT)) {
                                        (void) vsw_del_mcst(vswp,
                                            VSW_VNETPORT, addr, port);
                                        kmem_free(mcst_p, sizeof (*mcst_p));
                                        return (1);
                                }

                                mutex_enter(&port->mca_lock);
                                mcst_p->nextp = port->mcap;
                                port->mcap = mcst_p;
                                mutex_exit(&port->mca_lock);

                        } else {
                                DERR(vswp, "%s: error adding multicast "
                                    "address 0x%llx for port %ld",
                                    __func__, addr, port->p_instance);
                                return (1);
                        }
                } else {
                        /*
                         * Delete an entry from the multicast hash
                         * table and update the address list
                         * appropriately.
                         */
                        if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
                                D3(vswp, "%s: deleting multicast address "
                                    "0x%llx for port %ld", __func__, addr,
                                    port->p_instance);

                                mcst_p = vsw_del_addr(VSW_VNETPORT, port, addr);
                                ASSERT(mcst_p != NULL);

                                /*
                                 * Remove the address from HW. The address
                                 * will actually only be removed once the ref
                                 * count within the MAC layer has dropped to
                                 * zero. I.e. we can safely call this fn even
                                 * if other ports are interested in this
                                 * address.
                                 */
                                vsw_mac_multicast_remove(vswp, port, mcst_p,
                                    VSW_VNETPORT);
                                kmem_free(mcst_p, sizeof (*mcst_p));

                        } else {
                                DERR(vswp, "%s: error deleting multicast "
                                    "addr 0x%llx for port %ld",
                                    __func__, addr, port->p_instance);
                                return (1);
                        }
                }
        }
        D1(vswp, "%s: exit", __func__);
        return (0);
}

/*
 * Add a new multicast entry.
 *
 * Search hash table based on address. If match found then
 * update associated val (which is chain of ports), otherwise
 * create new key/val (addr/port) pair and insert into table.
 */
int
vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
{
        int             dup = 0;
        int             rv = 0;
        mfdb_ent_t      *ment = NULL;
        mfdb_ent_t      *tmp_ent = NULL;
        mfdb_ent_t      *new_ent = NULL;
        void            *tgt = NULL;

        if (devtype == VSW_VNETPORT) {
                /*
                 * Being invoked from a vnet.
                 */
                ASSERT(arg != NULL);
                tgt = arg;
                D2(NULL, "%s: port %d : address 0x%llx", __func__,
                    ((vsw_port_t *)arg)->p_instance, addr);
        } else {
                /*
                 * We are being invoked via the m_multicst mac entry
                 * point.
                 */
                D2(NULL, "%s: address 0x%llx", __func__, addr);
                tgt = (void *)vswp;
        }

        WRITE_ENTER(&vswp->mfdbrw);
        if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
            (mod_hash_val_t *)&ment) != 0) {

                /* address not currently in table */
                ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
                ment->d_addr = (void *)tgt;
                ment->d_type = devtype;
                ment->nextp = NULL;

                if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
                    (mod_hash_val_t)ment) != 0) {
                        DERR(vswp, "%s: hash table insertion failed", __func__);
                        kmem_free(ment, sizeof (mfdb_ent_t));
                        rv = 1;
                } else {
                        D2(vswp, "%s: added initial entry for 0x%llx to "
                            "table", __func__, addr);
                }
        } else {
                /*
                 * Address in table. Check to see if specified port
                 * is already associated with the address. If not add
                 * it now.
                 */
                tmp_ent = ment;
                while (tmp_ent != NULL) {
                        if (tmp_ent->d_addr == (void *)tgt) {
                                if (devtype == VSW_VNETPORT) {
                                        DERR(vswp, "%s: duplicate port entry "
                                            "found for portid %ld and key "
                                            "0x%llx", __func__,
                                            ((vsw_port_t *)arg)->p_instance,
                                            addr);
                                } else {
                                        DERR(vswp, "%s: duplicate entry found"
                                            "for key 0x%llx", __func__, addr);
                                }
                                rv = 1;
                                dup = 1;
                                break;
                        }
                        tmp_ent = tmp_ent->nextp;
                }

                /*
                 * Port not on list so add it to end now.
                 */
                if (0 == dup) {
                        D2(vswp, "%s: added entry for 0x%llx to table",
                            __func__, addr);
                        new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
                        new_ent->d_addr = (void *)tgt;
                        new_ent->d_type = devtype;
                        new_ent->nextp = NULL;

                        tmp_ent = ment;
                        while (tmp_ent->nextp != NULL)
                                tmp_ent = tmp_ent->nextp;

                        tmp_ent->nextp = new_ent;
                }
        }

        RW_EXIT(&vswp->mfdbrw);
        return (rv);
}

/*
 * Remove a multicast entry from the hashtable.
 *
 * Search hash table based on address. If match found, scan
 * list of ports associated with address. If specified port
 * found remove it from list.
 */
int
vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
{
        mfdb_ent_t      *ment = NULL;
        mfdb_ent_t      *curr_p, *prev_p;
        void            *tgt = NULL;

        D1(vswp, "%s: enter", __func__);

        if (devtype == VSW_VNETPORT) {
                tgt = (vsw_port_t *)arg;
                D2(vswp, "%s: removing port %d from mFDB for address"
                    " 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance, addr);
        } else {
                D2(vswp, "%s: removing entry", __func__);
                tgt = (void *)vswp;
        }

        WRITE_ENTER(&vswp->mfdbrw);
        if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
            (mod_hash_val_t *)&ment) != 0) {
                D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
                RW_EXIT(&vswp->mfdbrw);
                return (1);
        }

        prev_p = curr_p = ment;

        while (curr_p != NULL) {
                if (curr_p->d_addr == (void *)tgt) {
                        if (devtype == VSW_VNETPORT) {
                                D2(vswp, "%s: port %d found", __func__,
                                    ((vsw_port_t *)tgt)->p_instance);
                        } else {
                                D2(vswp, "%s: instance found", __func__);
                        }

                        if (prev_p == curr_p) {
                                /*
                                 * head of list, if no other element is in
                                 * list then destroy this entry, otherwise
                                 * just replace it with updated value.
                                 */
                                ment = curr_p->nextp;
                                if (ment == NULL) {
                                        (void) mod_hash_destroy(vswp->mfdb,
                                            (mod_hash_val_t)addr);
                                } else {
                                        (void) mod_hash_replace(vswp->mfdb,
                                            (mod_hash_key_t)addr,
                                            (mod_hash_val_t)ment);
                                }
                        } else {
                                /*
                                 * Not head of list, no need to do
                                 * replacement, just adjust list pointers.
                                 */
                                prev_p->nextp = curr_p->nextp;
                        }
                        break;
                }

                prev_p = curr_p;
                curr_p = curr_p->nextp;
        }

        RW_EXIT(&vswp->mfdbrw);

        D1(vswp, "%s: exit", __func__);

        if (curr_p == NULL)
                return (1);
        kmem_free(curr_p, sizeof (mfdb_ent_t));
        return (0);
}

/*
 * Port is being deleted, but has registered an interest in one
 * or more multicast groups. Using the list of addresses maintained
 * within the port structure find the appropriate entry in the hash
 * table and remove this port from the list of interested ports.
 */
void
vsw_del_mcst_port(vsw_port_t *port)
{
        mcst_addr_t     *mcap = NULL;
        vsw_t           *vswp = port->p_vswp;

        D1(vswp, "%s: enter", __func__);

        mutex_enter(&port->mca_lock);

        while ((mcap = port->mcap) != NULL) {

                port->mcap = mcap->nextp;

                mutex_exit(&port->mca_lock);

                (void) vsw_del_mcst(vswp, VSW_VNETPORT,
                    mcap->addr, port);

                /*
                 * Remove the address from HW. The address
                 * will actually only be removed once the ref
                 * count within the MAC layer has dropped to
                 * zero. I.e. we can safely call this fn even
                 * if other ports are interested in this
                 * address.
                 */
                vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT);
                kmem_free(mcap, sizeof (*mcap));

                mutex_enter(&port->mca_lock);

        }

        mutex_exit(&port->mca_lock);

        D1(vswp, "%s: exit", __func__);
}

/*
 * This vsw instance is detaching, but has registered an interest in one
 * or more multicast groups. Using the list of addresses maintained
 * within the vsw structure find the appropriate entry in the hash
 * table and remove this instance from the list of interested ports.
 */
void
vsw_del_mcst_vsw(vsw_t *vswp)
{
        mcst_addr_t     *next_p = NULL;

        D1(vswp, "%s: enter", __func__);

        mutex_enter(&vswp->mca_lock);

        while (vswp->mcap != NULL) {
                DERR(vswp, "%s: deleting addr 0x%llx",
                    __func__, vswp->mcap->addr);
                (void) vsw_del_mcst(vswp, VSW_LOCALDEV, vswp->mcap->addr, NULL);

                next_p = vswp->mcap->nextp;
                kmem_free(vswp->mcap, sizeof (mcst_addr_t));
                vswp->mcap = next_p;
        }

        vswp->mcap = NULL;
        mutex_exit(&vswp->mca_lock);

        D1(vswp, "%s: exit", __func__);
}

mblk_t *
vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp)
{
        mblk_t                  *bp;
        mblk_t                  *nbp;
        mblk_t                  *head = NULL;
        mblk_t                  *tail = NULL;
        mblk_t                  *prev = NULL;
        struct ether_header     *behp;

        /* process the chain of packets */
        bp = *mpp;
        while (bp) {
                nbp = bp->b_next;
                behp = (struct ether_header *)bp->b_rptr;
                bp->b_prev = NULL;
                if (ether_cmp(&ehp->ether_dhost, &behp->ether_dhost) == 0) {
                        if (prev == NULL) {
                                *mpp = nbp;
                        } else {
                                prev->b_next = nbp;
                        }
                        bp->b_next =  NULL;
                        if (head == NULL) {
                                head = tail = bp;
                        } else {
                                tail->b_next = bp;
                                tail = bp;
                        }
                } else {
                        prev = bp;
                }
                bp = nbp;
        }
        return (head);
}

static mblk_t *
vsw_dupmsgchain(mblk_t *mp)
{
        mblk_t  *nmp = NULL;
        mblk_t  **nmpp = &nmp;

        for (; mp != NULL; mp = mp->b_next) {
                if ((*nmpp = dupmsg(mp)) == NULL) {
                        freemsgchain(nmp);
                        return (NULL);
                }

                nmpp = &((*nmpp)->b_next);
        }

        return (nmp);
}