root/usr/src/uts/sun4v/io/vsw_phys.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <netinet/arp.h>
#include <inet/arp.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac_client.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mac.h>
#include <sys/mac_flow.h>
#include <sys/mdeg.h>
#include <sys/vsw.h>
#include <sys/vlan.h>

/* MAC Ring table functions. */
static void vsw_port_rx_cb(void *, mac_resource_handle_t, mblk_t *,
    boolean_t);
static void vsw_if_rx_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t);

/* MAC layer routines */
static int vsw_set_port_hw_addr(vsw_port_t *port);
static int vsw_set_if_hw_addr(vsw_t *vswp);
static  void vsw_unset_hw_addr(vsw_t *, vsw_port_t *, int);
static int vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type);
static void vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type);
static void vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type);
static void vsw_mac_multicast_remove_all(vsw_t *vswp,
    vsw_port_t *portp, int type);
static void vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch,
    uint8_t *macaddr, uint16_t flags, vsw_vlanid_t *vids, int nvids);
static void vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids,
    int nvids);
static  void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu);
static void vsw_maccl_set_bandwidth(vsw_t *vswp, vsw_port_t *port, int type,
    uint64_t maxbw);
static int vsw_notify_add(vsw_t *vswp);
static int vsw_notify_rem(vsw_t *vswp);
static void vsw_notify_cb(void *arg, mac_notify_type_t type);
static void vsw_notify_link(vsw_t *vswp);

/* Support functions */
int vsw_set_hw(vsw_t *, vsw_port_t *, int);
void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
void vsw_reconfig_hw(vsw_t *);
int vsw_mac_open(vsw_t *vswp);
void vsw_mac_close(vsw_t *vswp);
int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
    int type);
void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port,
    mcst_addr_t *mcst_p, int type);
int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
void vsw_mac_cleanup_ports(vsw_t *vswp);
void vsw_unset_addrs(vsw_t *vswp);
void vsw_set_addrs(vsw_t *vswp);
mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *);
void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans,
    uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
    vsw_vlanid_t *new_vids, int new_nvids);
void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
    uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
void vsw_update_bandwidth(vsw_t *vswp, vsw_port_t *port, int type,
    uint64_t maxbw);

/*
 * Functions imported from other files.
 */
extern int vsw_portsend(vsw_port_t *port, mblk_t *mp);
extern void vsw_hio_stop_port(vsw_port_t *portp);
extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
extern uint32_t vsw_publish_macaddr_count;
extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
        mblk_t **npt);
extern void vsw_physlink_state_update(vsw_t *vswp);
static char mac_mtu_propname[] = "mtu";

/*
 * Tunables used in this file.
 */
extern int vsw_mac_open_retries;

/*
 * Acquire or release the MAC client RW lock that matches 'type':
 * the vsw instance's maccl_rwlock when type is VSW_LOCALDEV, otherwise
 * the port's maccl_rwlock. Only the selected operand is evaluated, so
 * 'port' may be NULL when type is VSW_LOCALDEV.
 *
 * Arguments and the whole expansion are parenthesized so that the macros
 * behave as a single expression regardless of the argument expressions
 * or the surrounding context.
 */
#define WRITE_MACCL_ENTER(vswp, port, type)     \
        (((type) == VSW_LOCALDEV) ?             \
        rw_enter(&(vswp)->maccl_rwlock, RW_WRITER) :    \
        rw_enter(&(port)->maccl_rwlock, RW_WRITER))

#define READ_MACCL_ENTER(vswp, port, type)      \
        (((type) == VSW_LOCALDEV) ?             \
        rw_enter(&(vswp)->maccl_rwlock, RW_READER) :    \
        rw_enter(&(port)->maccl_rwlock, RW_READER))

#define RW_MACCL_EXIT(vswp, port, type) \
        (((type) == VSW_LOCALDEV) ?     \
        rw_exit(&(vswp)->maccl_rwlock) :        \
        rw_exit(&(port)->maccl_rwlock))


/*
 * Locking strategy in this file is explained as follows:
 *       - A global lock(vswp->mac_lock) is used to protect the
 *         MAC calls that deal with entire device. That is, the
 *         operations that deal with mac_handle which include
 *         mac_open()/close() and mac_client_open().
 *
 *      - A per port/interface RW lock(maccl_rwlock) is used to protect
 *        the operations that deal with the MAC client.
 *
 *      When both mac_lock and maccl_rwlock need to be held, the
 *      mac_lock needs to be acquired first and then maccl_rwlock. That is,
 *              mac_lock---->maccl_rwlock
 *
 *      The 'mca_lock' that protects the mcast list is also acquired
 *      within the context of maccl_rwlock. The hierarchy for this
 *      one is as below:
 *              maccl_rwlock---->mca_lock
 */


/*
 * Open MAC clients and program the unicast and multicast addresses of
 * the vsw interface (if it is up) and of every port whose address has
 * not been set yet. Failures are logged and otherwise ignored.
 *
 * When vsw_publish_macaddr_count is non-zero, the mac address of every
 * port is then announced to the physical switch.
 */
void
vsw_set_addrs(vsw_t *vswp)
{
        vsw_port_list_t *ports = &vswp->plist;
        vsw_port_t      *p;

        /* The interface state is examined under if_lockrw. */
        READ_ENTER(&vswp->if_lockrw);
        if ((vswp->if_state & VSW_IF_UP) != 0) {
                if (vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV) == 0) {
                        /* Notify the MAC layer of the changed address. */
                        mac_unicst_update(vswp->if_mh,
                            (uint8_t *)&vswp->if_addr);
                } else {
                        cmn_err(CE_NOTE,
                            "!vsw%d: failed to program interface "
                            "unicast address\n", vswp->instance);
                }
        }
        RW_EXIT(&vswp->if_lockrw);

        WRITE_ENTER(&ports->lockrw);

        /* Open a mac client for each port that has no address set yet. */
        for (p = ports->head; p != NULL; p = p->p_next) {
                if (!p->addr_set &&
                    vsw_mac_client_init(vswp, p, VSW_VNETPORT) != 0) {
                        cmn_err(CE_NOTE,
                            "!vsw%d: failed to program port(%d) "
                            "unicast address\n", vswp->instance,
                            p->p_instance);
                }
        }

        /* Announce the macaddr of vnets to the physical switch if enabled. */
        if (vsw_publish_macaddr_count != 0) {
                p = ports->head;
                while (p != NULL) {
                        vsw_publish_macaddr(vswp, p);
                        p = p->p_next;
                }
        }

        RW_EXIT(&ports->lockrw);
}

/*
 * Undo vsw_set_addrs(): clean up and close the mac client of the vsw
 * interface (only if the interface is up) and then the mac clients of
 * all ports.
 */
void
vsw_unset_addrs(vsw_t *vswp)
{
        /* The interface client is torn down while holding if_lockrw. */
        READ_ENTER(&vswp->if_lockrw);
        if ((vswp->if_state & VSW_IF_UP) != 0)
                vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
        RW_EXIT(&vswp->if_lockrw);

        vsw_mac_cleanup_ports(vswp);
}

/*
 * Open the underlying network device for access in layer2 mode.
 * The caller must hold vswp->mac_lock.
 *
 * Returns:
 *      0 on success (including when the device is already open)
 *      EAGAIN if mac_open() fails due to the device being not available yet.
 *      EIO on any other failure, or once the retry limit
 *      (vsw_mac_open_retries) has been exceeded.
 */
int
vsw_mac_open(vsw_t *vswp)
{
        int     err;

        ASSERT(MUTEX_HELD(&vswp->mac_lock));

        /* Nothing to do if the device is already open. */
        if (vswp->mh != NULL)
                return (0);

        /* Give up once the maximum number of attempts has been used. */
        if (vswp->mac_open_retries++ >= vsw_mac_open_retries)
                return (EIO);

        err = mac_open_by_linkname(vswp->physname, &vswp->mh);
        switch (err) {
        case 0:
                break;

        case ENOENT:
        case EBADF:
                /*
                 * Either the dlmgmtd door or the device is not available
                 * yet, so mac_open() needs to be retried. For example,
                 * this may happen during boot up, if the required link
                 * aggregation groups(devices) have not been created yet.
                 */
                return (EAGAIN);

        default:
                cmn_err(CE_WARN, "!vsw%d: mac_open %s failed rv:%x\n",
                    vswp->instance, vswp->physname, err);
                return (EIO);
        }

        /* Opened successfully: reset the retry counter. */
        vswp->mac_open_retries = 0;

        vsw_mac_set_mtu(vswp, vswp->mtu);

        /* A notification registration failure is logged but not fatal. */
        err = vsw_notify_add(vswp);
        if (err != 0) {
                cmn_err(CE_CONT, "!vsw%d: mac_notify_add %s failed rv:%x\n",
                    vswp->instance, vswp->physname, err);
        }

        return (0);
}

/*
 * Close the underlying physical device, if it is open.
 * The caller must hold vswp->mac_lock.
 *
 * Before closing, the notify callback is removed (if one is registered)
 * and the device MTU is set back to mtu_physdev_orig when it differs
 * from the vsw MTU.
 */
void
vsw_mac_close(vsw_t *vswp)
{
        ASSERT(MUTEX_HELD(&vswp->mac_lock));

        if (vswp->mh == NULL)
                return;

        if (vswp->mnh != 0) {
                (void) vsw_notify_rem(vswp);
                vswp->mnh = 0;
        }

        if (vswp->mtu != vswp->mtu_physdev_orig)
                vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig);

        mac_close(vswp->mh);
        vswp->mh = NULL;
}

/*
 * Add a multicast address to the MAC client of the interface
 * (type == VSW_LOCALDEV) or of the given port.
 *
 * If the MAC client is not open, nothing is programmed and 0 is returned.
 * On success the entry is marked as added (mac_added); on failure a
 * warning is logged and the error from mac_multicast_add() is returned.
 */
int
vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
    int type)
{
        mac_client_handle_t     client;
        int                     err = 0;

        WRITE_MACCL_ENTER(vswp, port, type);

        if (type == VSW_LOCALDEV)
                client = vswp->mch;
        else
                client = port->p_mch;

        if (client != NULL) {
                err = mac_multicast_add(client, mcst_p->mca.ether_addr_octet);
                if (err == 0) {
                        mcst_p->mac_added = B_TRUE;
                } else {
                        cmn_err(CE_WARN, "!vsw%d: unable to "
                            "program multicast address(%s) err=%d",
                            vswp->instance,
                            ether_sprintf((void *)&mcst_p->mca), err);
                }
        }

        RW_MACCL_EXIT(vswp, port, type);
        return (err);
}

/*
 * Remove a multicast address from the MAC client of the interface
 * (type == VSW_LOCALDEV) or of the given port.
 *
 * The address is removed only when the MAC client is open and the entry
 * is marked as previously added; the mark is cleared afterwards.
 */
void
vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p,
    int type)
{
        mac_client_handle_t     client;

        WRITE_MACCL_ENTER(vswp, port, type);

        if (type == VSW_LOCALDEV)
                client = vswp->mch;
        else
                client = port->p_mch;

        if (client != NULL && mcst_p->mac_added) {
                mac_multicast_remove(client, mcst_p->mca.ether_addr_octet);
                mcst_p->mac_added = B_FALSE;
        }

        RW_MACCL_EXIT(vswp, port, type);
}


/*
 * Add all multicast addresses of the interface (type == VSW_LOCALDEV)
 * or of the port (type == VSW_VNETPORT) to its MAC client.
 *
 * The caller must hold the corresponding maccl_rwlock as writer. The
 * multicast list is walked under the matching mca_lock. Entries already
 * marked as added are skipped; a failure to add an entry is logged and
 * the walk continues. Nothing is done if the MAC client is not open.
 */
static void
vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type)
{
        mcst_addr_t             *mcap;
        mac_client_handle_t     mch;
        kmutex_t                *mca_lockp;
        int                     rv;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
        if (type == VSW_LOCALDEV) {
                ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
                mch = vswp->mch;
                mcap = vswp->mcap;
                mca_lockp = &vswp->mca_lock;
        } else {
                ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock));
                mch = portp->p_mch;
                mcap = portp->mcap;
                mca_lockp = &portp->mca_lock;
        }

        if (mch == NULL)
                return;

        mutex_enter(mca_lockp);
        /* mcap already points at the list head; no loop initializer needed */
        for (; mcap != NULL; mcap = mcap->nextp) {
                if (mcap->mac_added)
                        continue;
                rv = mac_multicast_add(mch, (uchar_t *)&mcap->mca);
                if (rv == 0) {
                        mcap->mac_added = B_TRUE;
                } else {
                        cmn_err(CE_WARN, "!vsw%d: unable to program "
                            "multicast address(%s) err=%d", vswp->instance,
                            ether_sprintf((void *)&mcap->mca), rv);
                }
        }
        mutex_exit(mca_lockp);
}

/*
 * Remove all multicast addresses of the interface (type == VSW_LOCALDEV)
 * or of the port (type == VSW_VNETPORT) from its MAC client.
 *
 * The caller must hold the corresponding maccl_rwlock as writer. Only
 * entries marked as added are removed, and the mark is cleared. Nothing
 * is done if the MAC client is not open.
 */
static void
vsw_mac_multicast_remove_all(vsw_t *vswp, vsw_port_t *portp, int type)
{
        mac_client_handle_t     client;
        mcst_addr_t             *entry;
        kmutex_t                *list_lock;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));
        if (type == VSW_LOCALDEV) {
                ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
                client = vswp->mch;
                entry = vswp->mcap;
                list_lock = &vswp->mca_lock;
        } else {
                ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock));
                client = portp->p_mch;
                entry = portp->mcap;
                list_lock = &portp->mca_lock;
        }

        if (client == NULL)
                return;

        mutex_enter(list_lock);
        while (entry != NULL) {
                if (entry->mac_added) {
                        (void) mac_multicast_remove(client,
                            (uchar_t *)&entry->mca);
                        entry->mac_added = B_FALSE;
                }
                entry = entry->nextp;
        }
        mutex_exit(list_lock);
}

/*
 * Apply a new bandwidth limit to the MAC client of the interface
 * (type == VSW_LOCALDEV) or of the port (type == VSW_VNETPORT).
 * The matching maccl_rwlock is held as writer around the update.
 */
void
vsw_update_bandwidth(vsw_t *vswp, vsw_port_t *port, int type, uint64_t maxbw)
{
        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        WRITE_MACCL_ENTER(vswp, port, type);

        vsw_maccl_set_bandwidth(vswp, port, type, maxbw);

        RW_MACCL_EXIT(vswp, port, type);
}

/*
 * Open a mac client and program unicast and multicast addresses
 * for a port or the interface.
 *
 * Locks are taken in the order mac_lock, then maccl_rwlock (writer).
 * mac_lock is dropped as soon as the client open attempt has finished.
 *
 * Returns:
 *      0 on success
 *      non-zero if opening the MAC client failed. A failure reported by
 *      vsw_set_hw() is deliberately not propagated.
 */
int
vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type)
{
        int     err;

        mutex_enter(&vswp->mac_lock);
        WRITE_MACCL_ENTER(vswp, port, type);

        err = vsw_maccl_open(vswp, port, type);

        /* mac_lock only covers the open; release it now. */
        mutex_exit(&vswp->mac_lock);

        if (err != 0) {
                RW_MACCL_EXIT(vswp, port, type);
                return (err);
        }

        (void) vsw_set_hw(vswp, port, type);
        vsw_mac_multicast_add_all(vswp, port, type);

        RW_MACCL_EXIT(vswp, port, type);
        return (0);
}

/*
 * Open a MAC client for a port or an interface.
 * The caller must hold vswp->mac_lock.
 *
 * The flags and their purpose as below:
 *
 *      MAC_OPEN_FLAGS_SHARES_DESIRED -- This flag is used to indicate
 *      that a port desires a Share. This will be the case with
 *      the ports that have hybrid mode enabled. This will only cause
 *      MAC layer to allocate a share and corresponding resources
 *      ahead of time. Ports that are not HybridIO enabled are
 *      associated with default group & resources.
 *
 *      MAC_UNICAST_TAG_DISABLE -- This flag is used for VLAN
 *      support. It will cause MAC to not add any tags, but expect
 *      vsw to tag the packets.
 *
 *      MAC_UNICAST_STRIP_DISABLE -- This flag is used for VLAN
 *      support. It will cause the MAC layer to not strip the tags.
 *      Vsw may have to strip the tag for pvid case.
 *
 * Returns:
 *      0 on success, and also when the physical device is not open or
 *      the MAC client is already open (nothing is done in those cases).
 *      The error from mac_client_open() on failure.
 */
static int
vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type)
{
        int             rv = 0;
        int             instance;
        char            mac_cl_name[MAXNAMELEN];
        const char      *dev_name;
        mac_client_handle_t *mchp;
        uint64_t flags = 0;

        ASSERT(MUTEX_HELD(&vswp->mac_lock));
        if (vswp->mh == NULL) {
                /*
                 * In case net-dev is changed (either set to nothing or
                 * using aggregation device), return success here as the
                 * timeout mechanism will handle it.
                 */
                return (0);
        }

        mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch;
        if (*mchp != NULL) {
                /* already open */
                return (0);
        }
        dev_name = ddi_driver_name(vswp->dip);
        instance = ddi_get_instance(vswp->dip);
        if (type == VSW_VNETPORT) {
                if (port->p_hio_enabled)
                        flags |= MAC_OPEN_FLAGS_SHARES_DESIRED;
                (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%d%s%d", dev_name,
                    instance, "_port", port->p_instance);
        } else {
                (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%s%d",
                    dev_name, "_if", instance);
        }

        rv = mac_client_open(vswp->mh, mchp, mac_cl_name, flags);
        if (rv != 0) {
                cmn_err(CE_NOTE, "!vsw%d:%s mac_client_open() failed\n",
                    vswp->instance, mac_cl_name);
                /*
                 * The client was not opened, so there is no valid handle
                 * to configure; do not fall through to the ring setup.
                 */
                return (rv);
        }

        /* Clients that are not HybridIO enabled are given no rings. */
        if (type != VSW_VNETPORT || !port->p_hio_enabled)
                mac_client_set_rings(*mchp, MAC_RXRINGS_NONE, MAC_TXRINGS_NONE);

        return (0);
}

/*
 * Tear down the MAC client of a port or the interface: remove its
 * multicast addresses, remove its unicast address and vlans, and
 * finally close the client. All steps run with the matching
 * maccl_rwlock held as writer.
 */
void
vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type)
{
        WRITE_MACCL_ENTER(vswp, port, type);

        vsw_mac_multicast_remove_all(vswp, port, type);
        vsw_unset_hw(vswp, port, type);
        vsw_maccl_close(vswp, port, type);

        RW_MACCL_EXIT(vswp, port, type);
}

/*
 * Close the MAC client of a port or the interface, if it is open,
 * and clear the stored client handle.
 */
static void
vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type)
{
        mac_client_handle_t     *handlep;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        if (type == VSW_LOCALDEV)
                handlep = &vswp->mch;
        else
                handlep = &port->p_mch;

        if (*handlep == NULL)
                return;

        mac_client_close(*handlep, 0);
        *handlep = NULL;
}

/*
 * Clean up and close the MAC client of every port on the port list.
 * The port list lock is held as reader for the duration of the walk.
 */
void
vsw_mac_cleanup_ports(vsw_t *vswp)
{
        vsw_port_list_t         *ports = &vswp->plist;
        vsw_port_t              *p;

        READ_ENTER(&ports->lockrw);
        p = ports->head;
        while (p != NULL) {
                vsw_mac_client_cleanup(vswp, p, VSW_VNETPORT);
                p = p->p_next;
        }
        RW_EXIT(&ports->lockrw);
}

/*
 * Set up the physical device for a port or the interface, depending
 * on the switching mode.
 *
 * If in layer 3 mode, then do nothing and return 0.
 *
 * Otherwise program the mac-address and vlan-ids of the port
 * (type == VSW_VNETPORT) or of the interface through the
 * corresponding helper.
 *
 * Returns 0 on success, otherwise the non-zero value returned by
 * the helper.
 */
int
vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
        int     rv = 1;

        D1(vswp, "%s: enter", __func__);

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        if (vswp->smode == VSW_LAYER3)
                return (0);

        if (type != VSW_VNETPORT) {
                rv = vsw_set_if_hw_addr(vswp);
        } else {
                ASSERT(port != NULL);
                rv = vsw_set_port_hw_addr(port);
        }

        D1(vswp, "%s: exit", __func__);
        return (rv);
}

/*
 * Undo vsw_set_hw() for a port or the interface.
 *
 * If in layer 3 mode do nothing.
 *
 * Otherwise remove the unicast address and vlans of the port
 * (type == VSW_VNETPORT) or of the interface from the physical device.
 */
void
vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type)
{
        D1(vswp, "%s: enter", __func__);

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        if (vswp->smode != VSW_LAYER3) {
                if (type != VSW_VNETPORT) {
                        /* The interface case never passes a port along. */
                        vsw_unset_hw_addr(vswp, NULL, type);
                } else {
                        ASSERT(port != NULL);
                        vsw_unset_hw_addr(vswp, port, type);
                }

                D1(vswp, "%s: exit", __func__);
        }
}

/*
 * Program the macaddress and vlans of a port.
 *
 * The caller must hold the port's maccl_rwlock as writer. Nothing is
 * done (and 0 is returned) if the port has no open MAC client. The
 * unicast address is added only if it has not been set already; the
 * vlans, the bandwidth limit and the receive callback are applied in
 * either case.
 *
 * Returns 0 on success, or the error returned by mac_unicast_add().
 */
static int
vsw_set_port_hw_addr(vsw_port_t *port)
{
        vsw_t                   *vswp = port->p_vswp;
        mac_diag_t              diag;
        uint8_t                 *macaddr;
        uint16_t                vid = VLAN_ID_NONE;
        /* Must start at 0: it is returned as-is when addr_set is true. */
        int                     rv = 0;
        uint16_t                mac_flags = MAC_UNICAST_TAG_DISABLE |
            MAC_UNICAST_STRIP_DISABLE;

        D1(vswp, "%s: enter", __func__);

        ASSERT(RW_WRITE_HELD(&port->maccl_rwlock));
        if (port->p_mch == NULL)
                return (0);

        /*
         * If the port has a specific 'pvid', then
         * register with that vlan-id, otherwise register
         * with VLAN_ID_NONE.
         */
        if (port->pvid != vswp->default_vlan_id) {
                vid = port->pvid;
        }
        macaddr = (uint8_t *)port->p_macaddr.ether_addr_octet;

        if (!(vswp->smode & VSW_LAYER2_PROMISC)) {
                mac_flags |= MAC_UNICAST_HW;
        }

        if (port->addr_set == B_FALSE) {
                port->p_muh = NULL;
                rv = mac_unicast_add(port->p_mch, macaddr, mac_flags,
                    &port->p_muh, vid, &diag);

                if (rv != 0) {
                        cmn_err(CE_WARN, "vsw%d: Failed to program "
                            "macaddr,vid(%s, %d) err=%d",
                            vswp->instance, ether_sprintf((void *)macaddr),
                            vid, rv);
                        return (rv);
                }
                port->addr_set = B_TRUE;

                D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s",
                    __func__, ether_sprintf((void *)macaddr), vid,
                    vswp->physname);
        }

        /* Add vlans to the MAC layer */
        vsw_mac_add_vlans(vswp, port->p_mch, macaddr,
            mac_flags, port->vids, port->nvids);

        /* Configure bandwidth to the MAC layer */
        vsw_maccl_set_bandwidth(NULL, port, VSW_VNETPORT, port->p_bandwidth);

        mac_rx_set(port->p_mch, vsw_port_rx_cb, (void *)port);

        D1(vswp, "%s: exit", __func__);
        return (rv);
}

/*
 * Program the macaddress and vlans of the vsw interface.
 *
 * The caller must hold vswp->maccl_rwlock as writer. Nothing is done
 * (and 0 is returned) if the interface has no open MAC client. The
 * unicast address is added only if it has not been set already; the
 * vlans, the bandwidth limit and the receive callback are applied in
 * either case.
 *
 * Returns 0 on success, or the error returned by mac_unicast_add().
 */
static int
vsw_set_if_hw_addr(vsw_t *vswp)
{
        mac_diag_t              diag;
        uint8_t                 *macaddr;
        uint8_t                 primary_addr[ETHERADDRL];
        uint16_t                vid = VLAN_ID_NONE;
        /* Must start at 0: it is returned as-is when addr_set is true. */
        int                     rv = 0;
        uint16_t                mac_flags = MAC_UNICAST_TAG_DISABLE |
            MAC_UNICAST_STRIP_DISABLE;

        D1(vswp, "%s: enter", __func__);

        ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
        if (vswp->mch == NULL)
                return (0);

        macaddr = (uint8_t *)vswp->if_addr.ether_addr_octet;

        /* check if it is the primary macaddr of the card. */
        mac_unicast_primary_get(vswp->mh, primary_addr);
        if (ether_cmp((void *)primary_addr, (void*)macaddr) == 0) {
                mac_flags |= MAC_UNICAST_PRIMARY;
        }

        /*
         * If the interface has a specific 'pvid', then
         * register with that vlan-id, otherwise register
         * with VLAN_ID_NONE.
         */
        if (vswp->pvid != vswp->default_vlan_id) {
                vid = vswp->pvid;
        }

        if (!(vswp->smode & VSW_LAYER2_PROMISC)) {
                mac_flags |= MAC_UNICAST_HW;
        }

        if (vswp->addr_set == B_FALSE) {
                vswp->muh = NULL;
                rv = mac_unicast_add(vswp->mch, macaddr, mac_flags,
                    &vswp->muh, vid, &diag);

                if (rv != 0) {
                        cmn_err(CE_WARN, "vsw%d: Failed to program "
                            "macaddr,vid(%s, %d) err=%d",
                            vswp->instance, ether_sprintf((void *)macaddr),
                            vid, rv);
                        return (rv);
                }
                vswp->addr_set = B_TRUE;

                D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s",
                    __func__, ether_sprintf((void *)macaddr), vid,
                    vswp->physname);
        }

        /* Add vlans to the MAC layer */
        vsw_mac_add_vlans(vswp, vswp->mch, macaddr, mac_flags,
            vswp->vids, vswp->nvids);

        /* Configure bandwidth to the MAC layer */
        vsw_maccl_set_bandwidth(vswp, NULL, VSW_LOCALDEV, vswp->bandwidth);

        mac_rx_set(vswp->mch, vsw_if_rx_cb, (void *)vswp);

        D1(vswp, "%s: exit", __func__);
        return (rv);
}

/*
 * Remove the receive callback, the vlans and the unicast mac address
 * that were previously programmed for a port (type == VSW_VNETPORT)
 * or the interface (type == VSW_LOCALDEV).
 *
 * The caller must hold the corresponding maccl_rwlock as writer.
 * Nothing is done if the MAC client is not open. The unicast address
 * is removed only if it is marked as set.
 */
static void
vsw_unset_hw_addr(vsw_t *vswp, vsw_port_t *port, int type)
{
        mac_client_handle_t     client = NULL;
        vsw_vlanid_t            *vlan_ids;
        int                     vlan_cnt;

        D1(vswp, "%s: enter", __func__);

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        /* Pick up the vlan list and client handle for the given type. */
        if (type == VSW_VNETPORT) {
                ASSERT(port != NULL);
                ASSERT(RW_WRITE_HELD(&port->maccl_rwlock));
                vlan_ids = port->vids;
                vlan_cnt = port->nvids;
                client = port->p_mch;
        } else {
                ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
                vlan_ids = vswp->vids;
                vlan_cnt = vswp->nvids;
                if (type == VSW_LOCALDEV)
                        client = vswp->mch;
        }

        if (client == NULL)
                return;

        /* First clear the callback */
        mac_rx_clear(client);

        /* Remove vlans */
        vsw_mac_remove_vlans(client, vlan_ids, vlan_cnt);

        /* A non-NULL client means type is LOCALDEV or VNETPORT here. */
        if (type == VSW_LOCALDEV) {
                if (vswp->addr_set == B_TRUE) {
                        (void) mac_unicast_remove(vswp->mch, vswp->muh);
                        vswp->muh = NULL;
                        D2(vswp, "removed vsw interface mac-addr from "
                            "the device %s", vswp->physname);
                        vswp->addr_set = B_FALSE;
                }
        } else if (port->addr_set == B_TRUE) {
                (void) mac_unicast_remove(port->p_mch, port->p_muh);
                port->p_muh = NULL;
                D2(vswp, "removed port(0x%p) mac-addr from "
                    "the device %s", port, vswp->physname);
                port->addr_set = B_FALSE;
        }

        D1(vswp, "%s: exit", __func__);
}

/*
 * Receive callback routine for the vsw interface. Invoked by the MAC
 * layer when there are pkts being passed up from the physical device
 * for this vsw interface.
 *
 * If the interface is up, the frames are passed through
 * vsw_vlan_frame_untag() and, when it reports a non-zero count, handed
 * to mac_rx(). If the interface is down the whole chain is freed.
 */
/* ARGSUSED */
static void
vsw_if_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
        _NOTE(ARGUNUSED(mrh))

        vsw_t           *vswp = (vsw_t *)arg;
        mblk_t          *tail;
        boolean_t       if_up;
        int             nframes;

        ASSERT(vswp != NULL);

        D1(vswp, "%s: enter", __func__);

        /* Sample the interface state; the lock is not held afterwards. */
        READ_ENTER(&vswp->if_lockrw);
        if_up = (vswp->if_state & VSW_IF_UP) ? B_TRUE : B_FALSE;
        RW_EXIT(&vswp->if_lockrw);

        if (!if_up) {
                freemsgchain(mp);
        } else {
                nframes = vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &tail);
                if (nframes != 0)
                        mac_rx(vswp->if_mh, NULL, mp);
        }

        D1(vswp, "%s: exit", __func__);
}

/*
 * Receive callback routine for a port. Invoked by the MAC layer when
 * there are pkts being passed up from the physical device for this
 * port. The packets are forwarded to the peer with vsw_portsend(); its
 * return value is ignored.
 */
/* ARGSUSED */
static void
vsw_port_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t loopback)
{
        _NOTE(ARGUNUSED(mrh))

        vsw_port_t      *portp = arg;
        vsw_t           *vswp;

        ASSERT(portp != NULL);

        vswp = portp->p_vswp;

        D1(vswp, "vsw_port_rx_cb: enter");

        /* Send the packets to the peer directly. */
        (void) vsw_portsend(portp, mp);

        D1(vswp, "vsw_port_rx_cb: exit");
}

/*
 * Transmit a message over the physical device through the MAC layer.
 *
 * 'caller' selects whose MAC client is used: VSW_LOCALDEV picks the vsw
 * interface's handles, anything else picks the handles of 'port'. The
 * matching mac-client lock is held as reader for the duration of the call.
 *
 * Returns 'mp' unchanged if either the client handle or the unicast
 * handle is NULL. Otherwise the chain is given to mac_tx() with
 * MAC_DROP_ON_NO_DESC (so it is either sent or dropped) and NULL is
 * returned.
 */
mblk_t *
vsw_tx_msg(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port)
{
        mac_client_handle_t     mch;
        mac_unicast_handle_t    muh;
        mblk_t                  *unsent = mp;

        READ_MACCL_ENTER(vswp, port, caller);

        if (caller == VSW_LOCALDEV) {
                mch = vswp->mch;
                muh = vswp->muh;
        } else {
                mch = port->p_mch;
                muh = port->p_muh;
        }

        if (mch != NULL && muh != NULL) {
                /* packets are sent or dropped */
                (void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
                unsent = NULL;
        }

        RW_MACCL_EXIT(vswp, port, caller);
        return (unsent);
}

/*
 * vsw_port_mac_reconfig -- Tear down and close the port's MAC client,
 * then reopen and reprogram it. This serves two purposes:
 *
 *      1) Updating the MAC client with new vlan-ids. When 'update_vlans'
 *         is B_TRUE the port's existing vlan-id array is freed and
 *         replaced by 'new_vids'/'new_nvids', and the pvid by 'new_pvid',
 *         before the client is reopened.
 *
 *      2) Handling a change in the Hybrid mode status of a port. The MAC
 *         client needs to be closed and re-opened, otherwise Share
 *         related resources may not be freed (hybrid mode disabled) or
 *         assigned (hybrid mode enabled). The reopen results in using
 *         the flags based on the new hybrid mode of the port.
 *
 * The whole sequence runs with vswp->mac_lock held and the port's
 * maccl_rwlock held as writer. If the reopen fails, or the unicast
 * address cannot be set, the remaining steps are skipped.
 */
void
vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans,
    uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids)
{
        vsw_t   *vswp = portp->p_vswp;

        D1(vswp, "%s: enter", __func__);

        mutex_enter(&vswp->mac_lock);
        WRITE_ENTER(&portp->maccl_rwlock);

        /* Tear down: multicast addrs, unicast addr, then the client. */
        vsw_mac_multicast_remove_all(vswp, portp, VSW_VNETPORT);
        vsw_unset_hw(vswp, portp, VSW_VNETPORT);
        vsw_maccl_close(vswp, portp, VSW_VNETPORT);

        if (update_vlans == B_TRUE) {
                /* Release the old vlan-id array before adopting the new. */
                if (portp->nvids != 0) {
                        kmem_free(portp->vids,
                            portp->nvids * sizeof (vsw_vlanid_t));
                }
                portp->pvid = new_pvid;
                portp->vids = new_vids;
                portp->nvids = new_nvids;
        }

        /* Bring up: reopen the client, then unicast and multicast addrs. */
        if (vsw_maccl_open(vswp, portp, VSW_VNETPORT) == 0) {
                if (vsw_set_hw(vswp, portp, VSW_VNETPORT) != 0) {
                        cmn_err(CE_NOTE, "!vsw%d: port:%d failed to "
                            "set unicast address\n",
                            vswp->instance, portp->p_instance);
                } else {
                        vsw_mac_multicast_add_all(vswp, portp, VSW_VNETPORT);
                }
        }

        RW_EXIT(&portp->maccl_rwlock);
        mutex_exit(&vswp->mac_lock);
        D1(vswp, "%s: exit", __func__);
}

/*
 * vsw_if_mac_reconfig -- Reconfigure the vsw interface's mac-client
 * by closing and re-opening it. This function is used to handle the
 * following two cases:
 *
 *      1) A MAC address change for the interface.
 *      2) A vlan update. When 'update_vlans' is B_TRUE the existing
 *         vlan-id array is freed and replaced by 'new_vids'/'new_nvids',
 *         and the pvid by 'new_pvid', before the client is reopened.
 *
 * The whole sequence runs with vswp->mac_lock held and
 * vswp->maccl_rwlock held as writer. If the reopen fails, or the unicast
 * address cannot be set, the remaining steps are skipped.
 */
void
vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
    uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids)
{
        D1(vswp, "%s: enter", __func__);

        mutex_enter(&vswp->mac_lock);
        WRITE_ENTER(&vswp->maccl_rwlock);

        /* Tear down: multicast addrs, unicast addr, then the client. */
        vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV);
        vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
        vsw_maccl_close(vswp, NULL, VSW_LOCALDEV);

        if (update_vlans == B_TRUE) {
                /* Release the old vlan-id array before adopting the new. */
                if (vswp->nvids != 0) {
                        kmem_free(vswp->vids,
                            vswp->nvids * sizeof (vsw_vlanid_t));
                }
                vswp->pvid = new_pvid;
                vswp->vids = new_vids;
                vswp->nvids = new_nvids;
        }

        /* Bring up: reopen the client, then unicast and multicast addrs. */
        if (vsw_maccl_open(vswp, NULL, VSW_LOCALDEV) == 0) {
                if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV) != 0) {
                        cmn_err(CE_NOTE,
                            "!vsw%d:failed to set unicast address\n",
                            vswp->instance);
                } else {
                        vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV);
                }
        }

        RW_EXIT(&vswp->maccl_rwlock);
        mutex_exit(&vswp->mac_lock);
        D1(vswp, "%s: exit", __func__);
}

/*
 * vsw_mac_port_reconfig_vlans -- Reconfigure a port to handle a
 * vlan configuration update. As the removal of the last unicast-address,vid
 * from the MAC client results in releasing all resources, it expects
 * no Shares to be associated with such a MAC client.
 *
 * To handle a vlan configuration update for a port that already has
 * a Share bound, we need to free that Share prior to reconfiguration and
 * initiate the HybridIO setup again after the reconfiguration completes.
 *
 * 'new_pvid', 'new_vids' and 'new_nvids' are passed through unchanged to
 * vsw_port_mac_reconfig() with update_vlans set to B_TRUE.
 */
void
vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
    vsw_vlanid_t *new_vids, int new_nvids)
{
        /*
         * As the reconfiguration involves the close of
         * mac client, cleanup HybridIO and later restart
         * HybridIO setup again.
         */
        if (portp->p_hio_enabled == B_TRUE) {
                vsw_hio_stop_port(portp);
        }
        vsw_port_mac_reconfig(portp, B_TRUE, new_pvid, new_vids, new_nvids);
        /*
         * p_hio_enabled is read again here rather than cached from the
         * check above, so the reset only runs if the flag is still set
         * after the reconfiguration.
         */
        if (portp->p_hio_enabled == B_TRUE) {
                /* reset to setup the HybridIO again. */
                vsw_hio_port_reset(portp, B_FALSE);
        }
}

/*
 * Add vlans to MAC client.
 *
 * For each of the 'nvids' entries in 'vids' that is not already marked
 * as set, program the (macaddr, vid) pair into the MAC client 'mch' with
 * mac_unicast_add(). 'flags' is passed through with
 * MAC_UNICAST_TAG_DISABLE and MAC_UNICAST_STRIP_DISABLE added. On success
 * the entry's vl_set is set to B_TRUE and its unicast handle is stored in
 * vl_muh; on failure a warning is logged and the loop continues with the
 * next vlan-id.
 */
static void
vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, uint8_t *macaddr,
    uint16_t flags, vsw_vlanid_t *vids, int nvids)
{
        vsw_vlanid_t    *vidp;
        mac_diag_t      diag;
        int             rv;
        int             i;

        flags |= MAC_UNICAST_TAG_DISABLE | MAC_UNICAST_STRIP_DISABLE;

        /* Add vlans to the MAC layer */
        for (i = 0; i < nvids; i++) {
                vidp = &vids[i];

                /* Already programmed; nothing to do for this vlan-id. */
                if (vidp->vl_set == B_TRUE) {
                        continue;
                }

                rv = mac_unicast_add(mch, macaddr, flags,
                    &vidp->vl_muh, vidp->vl_vid, &diag);
                if (rv != 0) {
                        /*
                         * Note the trailing space: the two literals are
                         * concatenated into a single message.
                         */
                        cmn_err(CE_WARN, "vsw%d: Failed to program "
                            "macaddr,vid(%s, %d) err=%d",
                            vswp->instance, ether_sprintf((void *)macaddr),
                            vidp->vl_vid, rv);
                } else {
                        vidp->vl_set = B_TRUE;
                        D2(vswp, "%s:programmed macaddr(%s) vid(%d) "
                            "into device %s", __func__,
                            ether_sprintf((void *)macaddr),
                            vidp->vl_vid, vswp->physname);
                }
        }
}

/*
 * Remove vlans from the MAC client.
 *
 * Every entry in 'vids' (of length 'nvids') that is currently marked as
 * set has its unicast handle removed from 'mch' with
 * mac_unicast_remove() and is then marked as not set. Entries that are
 * not set are left alone.
 */
static void
vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, int nvids)
{
        int     idx;

        for (idx = 0; idx < nvids; idx++) {
                vsw_vlanid_t    *entry = &vids[idx];

                if (entry->vl_set != B_FALSE) {
                        (void) mac_unicast_remove(mch, entry->vl_muh);
                        entry->vl_set = B_FALSE;
                }
        }
}

#define ARH_FIXED_LEN   8    /* Length of fixed part of ARP header(see arp.h) */

/*
 * Send a gratuitous RARP packet to notify the physical switch to update its
 * Layer2 forwarding table for the given mac address. This is done to allow the
 * switch to quickly learn the macaddr-port association when a guest is live
 * migrated or when vsw's physical device is changed dynamically. Any protocol
 * packet would serve this purpose, but we choose RARP, as it allows us to
 * accomplish this within L2 (ie, no need to specify IP addr etc in the packet)
 * The macaddr of vnet is retained across migration. Hence, we don't need to
 * update the arp cache of other hosts within the broadcast domain. Note that
 * it is harmless to send these RARP packets during normal port attach of a
 * client vnet. This can be turned off if needed, by setting
 * vsw_publish_macaddr_count to zero in /etc/system.
 *
 * The packet is built once and a duplicate of it is transmitted
 * vsw_publish_macaddr_count times through vsw_tx_msg() on behalf of
 * 'portp'. Allocation or duplication failures are silently tolerated:
 * this is a best-effort notification.
 */
void
vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp)
{
        mblk_t                  *mp;
        mblk_t                  *bp;
        struct arphdr           *arh;
        struct  ether_header    *ehp;
        int                     count = 0;
        int                     plen = 4;
        uint8_t                 *cp;

        mp = allocb(ETHERMIN, BPRI_MED);
        if (mp == NULL) {
                return;
        }

        /*
         * allocb() does not clear the data buffer. The frame built below
         * occupies only 42 bytes but is padded out to ETHERMIN, so zero
         * the whole buffer up front; otherwise the pad bytes would put
         * uninitialized kernel memory on the wire.
         */
        bzero(mp->b_rptr, ETHERMIN);

        /* Initialize eth header */
        ehp = (struct  ether_header *)mp->b_rptr;
        bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL);
        bcopy(&portp->p_macaddr, &ehp->ether_shost, ETHERADDRL);
        ehp->ether_type = htons(ETHERTYPE_REVARP);

        /* Initialize arp packet */
        arh = (struct arphdr *)(mp->b_rptr + sizeof (struct ether_header));
        cp = (uint8_t *)arh;

        arh->ar_hrd = htons(ARPHRD_ETHER);      /* Hardware type:  ethernet */
        arh->ar_pro = htons(ETHERTYPE_IP);      /* Protocol type:  IP */
        arh->ar_hln = ETHERADDRL;       /* Length of hardware address:  6 */
        arh->ar_pln = plen;             /* Length of protocol address:  4 */
        arh->ar_op = htons(REVARP_REQUEST);     /* Opcode: REVARP Request */

        cp += ARH_FIXED_LEN;

        /* Sender's hardware address and protocol address */
        bcopy(&portp->p_macaddr, cp, ETHERADDRL);
        cp += ETHERADDRL;
        bzero(cp, plen);        /* INADDR_ANY */
        cp += plen;

        /* Target hardware address and protocol address */
        bcopy(&portp->p_macaddr, cp, ETHERADDRL);
        cp += ETHERADDRL;
        bzero(cp, plen);        /* INADDR_ANY */
        cp += plen;

        mp->b_wptr += ETHERMIN; /* total size is 42; round up to ETHERMIN */

        for (count = 0; count < vsw_publish_macaddr_count; count++) {

                /* Send a duplicate so the template can be reused. */
                bp = dupmsg(mp);
                if (bp == NULL) {
                        continue;
                }

                /* transmit the packet */
                bp = vsw_tx_msg(vswp, bp, VSW_VNETPORT, portp);
                if (bp != NULL) {
                        /* vsw_tx_msg() hands back what it could not send */
                        freemsg(bp);
                }
        }

        freemsg(mp);
}

/*
 * Set the mtu of the underlying physical device to 'mtu' through
 * mac_set_mtu(). On success the mtu reported back by that call is saved
 * in vswp->mtu_physdev_orig so it can be restored later if needed; on
 * failure a note is logged and nothing is saved.
 */
static void
vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu)
{
        uint_t  prev_mtu;

        if (mac_set_mtu(vswp->mh, mtu, &prev_mtu) == 0) {
                /* remember the physdev's original mtu for a later reset */
                vswp->mtu_physdev_orig = prev_mtu;
                return;
        }

        cmn_err(CE_NOTE,
            "!vsw%d: Unable to set the mtu:%d, in the "
            "physical device:%s\n",
            vswp->instance, mtu, vswp->physname);
}

/*
 * Register a callback with the underlying mac layer for notifications.
 * We are currently interested in only link-state events.
 *
 * If mac_no_notification() reports either DL_NOTE_LINK_UP or
 * DL_NOTE_LINK_DOWN, the device is recorded as not providing link
 * updates, the cached link state is set to LINK_STATE_UP and no callback
 * is registered. Otherwise the current link state is read and cached and
 * vsw_notify_cb is registered.
 *
 * Returns 0 on success (including the no-callback case) and 1 if
 * mac_notify_add() fails.
 */
static int
vsw_notify_add(vsw_t *vswp)
{
        mac_notify_handle_t     handle;
        uint32_t                unsupported;

        /* Does the underlying MAC support link update notification? */
        unsupported = mac_no_notification(vswp->mh);
        vswp->phys_no_link_update =
            ((unsupported & (DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN)) != 0) ?
            B_TRUE : B_FALSE;

        if (vswp->phys_no_link_update == B_TRUE) {
                /* No link updates will arrive: treat the link as up. */
                vswp->phys_link_state = LINK_STATE_UP;
                return (0);
        }

        /* Cache the current link state before registering the callback. */
        vswp->phys_link_state = mac_stat_get(vswp->mh, MAC_STAT_LINK_STATE);

        handle = mac_notify_add(vswp->mh, vsw_notify_cb, vswp);
        if (handle == 0) {
                /* failed */
                return (1);
        }

        vswp->mnh = handle;
        return (0);
}

/*
 * Remove the notify callback identified by vswp->mnh.
 * Returns whatever mac_notify_remove() returns.
 */
static int
vsw_notify_rem(vsw_t *vswp)
{
        return (mac_notify_remove(vswp->mnh, B_FALSE));
}

/*
 * Notification callback invoked by the MAC service module.
 * 'arg' is the vsw_t supplied at registration. Only MAC_NOTE_LINK is
 * acted upon; every other notification type is ignored.
 */
static void
vsw_notify_cb(void *arg, mac_notify_type_t type)
{
        if (type == MAC_NOTE_LINK) {
                vsw_notify_link((vsw_t *)arg);
        }
}

/*
 * Invoked upon receiving a MAC_NOTE_LINK notification for the
 * underlying physical device. Reads the device's current link state and,
 * if it differs from the cached vswp->phys_link_state, updates the cache
 * and calls vsw_physlink_state_update(). Nothing is done when the state
 * is unchanged.
 */
static void
vsw_notify_link(vsw_t *vswp)
{
        link_state_t    cur_state;

        cur_state = mac_stat_get(vswp->mh, MAC_STAT_LINK_STATE);
        if (vswp->phys_link_state == cur_state) {
                return;
        }

        /* Logged before the update, so this shows the previous state. */
        D3(vswp, "%s: phys_link_state(%d)\n",
            __func__, vswp->phys_link_state);

        vswp->phys_link_state = cur_state;
        vsw_physlink_state_update(vswp);
}

/*
 * Configure the bandwidth limit on the vsw or vnet devices via the MAC layer.
 * Note that bandwidth limit is not supported on a HybridIO enabled
 * vnet, as the HybridIO assigns a specific unit of hardware resource
 * that cannot be changed to limit bandwidth.
 *
 * 'type' must be VSW_LOCALDEV or VSW_VNETPORT and selects whether the
 * vsw's or the port's MAC client and stored bandwidth are used; the
 * matching maccl_rwlock is asserted to be held as writer. Nothing is done
 * if the MAC client is not open, or if 'maxbw' is non-zero but below
 * MRP_MAXBW_MINVAL. A 'maxbw' of zero requests MRP_MAXBW_RESETVAL. The
 * stored bandwidth is updated to 'maxbw' only when the MAC layer accepts
 * the setting; otherwise a note is logged.
 */
static void
vsw_maccl_set_bandwidth(vsw_t *vswp, vsw_port_t *port, int type, uint64_t maxbw)
{
        mac_client_handle_t     mch;
        mac_resource_props_t    *props;
        uint64_t                *saved_bw;
        int                     err;

        ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT));

        if (type == VSW_VNETPORT) {
                ASSERT(RW_WRITE_HELD(&port->maccl_rwlock));
                mch = port->p_mch;
                saved_bw = &port->p_bandwidth;
        } else {
                ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock));
                mch = vswp->mch;
                saved_bw = &vswp->bandwidth;
        }

        if (mch == NULL) {
                return;
        }

        /* Non-zero values below the minimum are not applied at all. */
        if (maxbw != 0 && maxbw < MRP_MAXBW_MINVAL) {
                return;
        }

        props = kmem_zalloc(sizeof (*props), KM_SLEEP);
        props->mrp_maxbw = (maxbw == 0) ? MRP_MAXBW_RESETVAL : maxbw;
        props->mrp_mask |= MRP_MAXBW;

        err = mac_client_set_resources(mch, props);
        kmem_free(props, sizeof (*props));

        if (err == 0) {
                /* update with successfully configured bandwidth. */
                *saved_bw = maxbw;
                return;
        }

        if (type == VSW_VNETPORT) {
                cmn_err(CE_NOTE, "!port%d: cannot set "
                    "bandwidth limit to (%ld), error(%d)\n",
                    port->p_instance, maxbw, err);
        } else {
                cmn_err(CE_NOTE, "!vsw%d: cannot set "
                    "bandwidth limit to (%ld), error(%d)\n",
                    vswp->instance, maxbw, err);
        }
}