/* root/usr/src/uts/sun4v/io/vsw_ldc.c */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strlog.h>
#include <sys/strsubr.h>
#include <sys/cmn_err.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/kstat.h>
#include <sys/vtrace.h>
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/ethernet.h>
#include <net/if.h>
#include <sys/varargs.h>
#include <sys/machsystm.h>
#include <sys/modctl.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_ether.h>
#include <sys/taskq.h>
#include <sys/note.h>
#include <sys/mach_descrip.h>
#include <sys/mdeg.h>
#include <sys/ldc.h>
#include <sys/vsw_fdb.h>
#include <sys/vsw.h>
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/vio_util.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/vlan.h>

/* Port add/deletion/etc routines */
static  void vsw_port_delete(vsw_port_t *port);
static  int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
static  void vsw_ldc_detach(vsw_ldc_t *ldcp);
static  int vsw_ldc_init(vsw_ldc_t *ldcp);
static  void vsw_ldc_uninit(vsw_ldc_t *ldcp);
static  void vsw_ldc_drain(vsw_ldc_t *ldcp);
static  void vsw_drain_port_taskq(vsw_port_t *port);
static  void vsw_marker_task(void *);
static  int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
void vsw_detach_ports(vsw_t *vswp);
int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
int vsw_port_detach(vsw_t *vswp, int p_instance);
int vsw_portsend(vsw_port_t *port, mblk_t *mp);
int vsw_port_attach(vsw_port_t *portp);
vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate);
void vsw_reset_ports(vsw_t *vswp);
void vsw_port_reset(vsw_port_t *portp);
void vsw_physlink_update_ports(vsw_t *vswp);
static  void vsw_port_physlink_update(vsw_port_t *portp);

/* Interrupt routines */
static  uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);

/* Handshake routines */
static  void vsw_ldc_reinit(vsw_ldc_t *);
static  void vsw_conn_task(void *);
static  int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
static  void vsw_next_milestone(vsw_ldc_t *);
static  int vsw_supported_version(vio_ver_msg_t *);
static  void vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp);
static  void vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp);
void vsw_process_conn_evt(vsw_ldc_t *, uint16_t);

/* Data processing routines */
void vsw_process_pkt(void *);
static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t *, int);
static void vsw_process_ctrl_pkt(void *);
static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
static void vsw_process_physlink_msg(vsw_ldc_t *, void *);
static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *,
        uint32_t);
static void vsw_process_pkt_data_nop(void *, void *, uint32_t);
static void vsw_process_pkt_data(void *, void *, uint32_t);
static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t *);
static void vsw_process_evt_read(vsw_ldc_t *ldcp);
static void vsw_ldc_rcv(vsw_ldc_t *ldcp);

/* Switching/data transmit routines */
static  int vsw_descrsend(vsw_ldc_t *, mblk_t *);
static void vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp);
static int vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries);
static int vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);
static int vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count);

/* Packet creation routines */
static void vsw_send_ver(void *);
static void vsw_send_attr(vsw_ldc_t *);
static void vsw_send_dring_info(vsw_ldc_t *);
static void vsw_send_rdx(vsw_ldc_t *);
static void vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state);

/* Dring routines */
static void vsw_create_privring(vsw_ldc_t *);
static dring_info_t *vsw_map_dring(vsw_ldc_t *ldcp, void *pkt);
static void vsw_unmap_dring(vsw_ldc_t *ldcp);
static void vsw_destroy_dring(vsw_ldc_t *ldcp);
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt);
static void vsw_set_lane_attr(vsw_t *, lane_t *);
dring_info_t *vsw_map_dring_cmn(vsw_ldc_t *ldcp,
    vio_dring_reg_msg_t *dring_pkt);
static int vsw_mapin_avail(vsw_ldc_t *ldcp);

/* tx/msg/rcv thread routines */
static void vsw_stop_tx_thread(vsw_ldc_t *ldcp);
static void vsw_ldc_tx_worker(void *arg);

/* Misc support routines */
static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
static int vsw_get_same_dest_list(struct ether_header *ehp,
    mblk_t **rhead, mblk_t **rtail, mblk_t **mpp);
static mblk_t *vsw_dupmsgchain(mblk_t *mp);

/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
static void display_lane(lane_t *);
static void display_ring(dring_info_t *);

/*
 * Functions imported from other files.
 */
extern int vsw_set_hw(vsw_t *, vsw_port_t *, int);
extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int);
extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port);
extern void vsw_del_mcst_port(vsw_port_t *port);
extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern int vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg);
extern void vsw_fdbe_add(vsw_t *vswp, void *port);
extern void vsw_fdbe_del(vsw_t *vswp, struct ether_addr *eaddr);
extern void vsw_create_vlans(void *arg, int type);
extern void vsw_destroy_vlans(void *arg, int type);
extern void vsw_vlan_add_ids(void *arg, int type);
extern void vsw_vlan_remove_ids(void *arg, int type);
extern boolean_t vsw_frame_lookup_vid(void *arg, int caller,
        struct ether_header *ehp, uint16_t *vidp);
extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
        mblk_t **npt);
extern boolean_t vsw_vlan_lookup(mod_hash_t *vlan_hashp, uint16_t vid);
extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp);
extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg);
extern void vsw_hio_stop_port(vsw_port_t *portp);
extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp);
extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
extern void vsw_destroy_rxpools(void *arg);
extern void vsw_stop_msg_thread(vsw_ldc_t *ldcp);
extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t);
extern int vsw_dringsend(vsw_ldc_t *, mblk_t *);
extern int vsw_reclaim_dring(dring_info_t *dp, int start);
extern int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
    int *);
extern vio_dring_reg_msg_t *vsw_create_tx_dring_info(vsw_ldc_t *);
extern int vsw_setup_tx_dring(vsw_ldc_t *ldcp, dring_info_t *dp);
extern void vsw_destroy_tx_dring(vsw_ldc_t *ldcp);
extern dring_info_t *vsw_map_rx_dring(vsw_ldc_t *ldcp, void *pkt);
extern void vsw_unmap_rx_dring(vsw_ldc_t *ldcp);
extern void vsw_ldc_msg_worker(void *arg);
extern void vsw_process_dringdata(void *, void *);
extern vio_dring_reg_msg_t *vsw_create_rx_dring_info(vsw_ldc_t *);
extern void vsw_destroy_rx_dring(vsw_ldc_t *ldcp);
extern dring_info_t *vsw_map_tx_dring(vsw_ldc_t *ldcp, void *pkt);
extern void vsw_unmap_tx_dring(vsw_ldc_t *ldcp);
extern void vsw_ldc_rcv_worker(void *arg);
extern void vsw_stop_rcv_thread(vsw_ldc_t *ldcp);
extern int vsw_dringsend_shm(vsw_ldc_t *, mblk_t *);
extern void vsw_process_dringdata_shm(void *, void *);

/*
 * Tunables used in this file.
 */
extern int vsw_num_handshakes;
extern int vsw_ldc_tx_delay;
extern int vsw_ldc_tx_retries;
extern int vsw_ldc_retries;
extern int vsw_ldc_delay;
extern boolean_t vsw_ldc_rxthr_enabled;
extern boolean_t vsw_ldc_txthr_enabled;
extern uint32_t vsw_num_descriptors;
extern uint8_t  vsw_dring_mode;
extern uint32_t vsw_max_tx_qcount;
extern boolean_t vsw_obp_ver_proto_workaround;
extern uint32_t vsw_publish_macaddr_count;
extern uint32_t vsw_nrbufs_factor;

#define LDC_ENTER_LOCK(ldcp)    \
                                mutex_enter(&((ldcp)->ldc_cblock));\
                                mutex_enter(&((ldcp)->ldc_rxlock));\
                                mutex_enter(&((ldcp)->ldc_txlock));
#define LDC_EXIT_LOCK(ldcp)     \
                                mutex_exit(&((ldcp)->ldc_txlock));\
                                mutex_exit(&((ldcp)->ldc_rxlock));\
                                mutex_exit(&((ldcp)->ldc_cblock));

#define VSW_VER_EQ(ldcp, major, minor)  \
        ((ldcp)->lane_out.ver_major == (major) &&       \
            (ldcp)->lane_out.ver_minor == (minor))

#define VSW_VER_LT(ldcp, major, minor)  \
        (((ldcp)->lane_out.ver_major < (major)) ||      \
            ((ldcp)->lane_out.ver_major == (major) &&   \
            (ldcp)->lane_out.ver_minor < (minor)))

#define VSW_VER_GTEQ(ldcp, major, minor)        \
        (((ldcp)->lane_out.ver_major > (major)) ||      \
            ((ldcp)->lane_out.ver_major == (major) &&   \
            (ldcp)->lane_out.ver_minor >= (minor)))

#define VSW_VER_LTEQ(ldcp, major, minor)        \
        (((ldcp)->lane_out.ver_major < (major)) ||      \
            ((ldcp)->lane_out.ver_major == (major) &&   \
            (ldcp)->lane_out.ver_minor <= (minor)))

/*
 * VIO Protocol Version Info:
 *
 * The version specified below represents the version of protocol currently
 * supported in the driver. It means the driver can negotiate with peers with
 * versions <= this version. Here is a summary of the feature(s) that are
 * supported at each version of the protocol:
 *
 * 1.0                  Basic VIO protocol.
 * 1.1                  vDisk protocol update (no virtual network update).
 * 1.2                  Support for priority frames (priority-ether-types).
 * 1.3                  VLAN and HybridIO support.
 * 1.4                  Jumbo Frame support.
 * 1.5                  Link State Notification support with optional support
 *                      for Physical Link information.
 * 1.6                  Support for RxDringData mode.
 */
static  ver_sup_t       vsw_versions[] = { {1, 6} };

/*
 * For the moment the state dump routines have their own
 * private flag.
 */
#define DUMP_STATE      0

#if DUMP_STATE

#define DUMP_TAG(tag) \
{                       \
        D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
        D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);  \
        D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);       \
}

#define DUMP_TAG_PTR(tag) \
{                       \
        D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
        D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype); \
        D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);      \
}

#define DUMP_FLAGS(flags) dump_flags(flags);
#define DISPLAY_STATE() display_state()

#else

#define DUMP_TAG(tag)
#define DUMP_TAG_PTR(tag)
#define DUMP_FLAGS(state)
#define DISPLAY_STATE()

#endif  /* DUMP_STATE */

/*
 * Attach the specified port.
 *
 * The caller has already allocated and partially filled in the port
 * structure (instance number, ldc ids, back pointer to the vsw).  This
 * routine initializes the port's locks and state, attaches its channel,
 * optionally opens a mac client, adds fdb/vlan state and links the port
 * into the vsw instance's port list.
 *
 * On failure the port structure itself is freed here, so the caller
 * must not touch it after a non-zero return.
 *
 * Returns 0 on success, 1 on failure.
 */
int
vsw_port_attach(vsw_port_t *port)
{
        vsw_t                   *vswp = port->p_vswp;
        vsw_port_list_t         *plist = &vswp->plist;
        vsw_port_t              *p, **pp;
        int                     nids = port->num_ldcs;
        uint64_t                *ldcids;
        int                     rv;

        D1(vswp, "%s: enter : port %d", __func__, port->p_instance);

        /* port already exists? */
        READ_ENTER(&plist->lockrw);
        for (p = plist->head; p != NULL; p = p->p_next) {
                if (p->p_instance == port->p_instance) {
                        DWARN(vswp, "%s: port instance %d already attached",
                            __func__, p->p_instance);
                        RW_EXIT(&plist->lockrw);
                        return (1);
                }
        }
        RW_EXIT(&plist->lockrw);

        mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
        rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL);

        mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
        port->state = VSW_PORT_INIT;

        /* only ldc_ids[0] is attached; nids is logged for diagnostics */
        D2(vswp, "%s: %d nids", __func__, nids);
        ldcids = port->ldc_ids;
        D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[0]);
        if (vsw_ldc_attach(port, (uint64_t)ldcids[0]) != 0) {
                DERR(vswp, "%s: ldc_attach failed", __func__);
                goto exit_error;
        }

        if (vswp->switching_setup_done == B_TRUE) {
                /*
                 * If the underlying network device has been setup,
                 * then open a mac client and program the mac address
                 * for this port.
                 */
                rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT);
                if (rv != 0) {
                        /*
                         * NOTE(review): the channel attached above is not
                         * detached on this path before the port is freed —
                         * looks like a potential ldc resource leak; confirm.
                         */
                        goto exit_error;
                }
        }

        /* create the fdb entry for this port/mac address */
        vsw_fdbe_add(vswp, port);

        vsw_create_vlans(port, VSW_VNETPORT);

        WRITE_ENTER(&plist->lockrw);

        /* link it into the list of ports for this vsw instance */
        pp = (vsw_port_t **)(&plist->head);
        port->p_next = *pp;
        *pp = port;
        plist->num_ports++;

        RW_EXIT(&plist->lockrw);

        /*
         * Initialise the port and any ldc's under it.
         */
        (void) vsw_ldc_init(port->ldcp);

        /* announce macaddr of vnet to the physical switch */
        if (vsw_publish_macaddr_count != 0) {   /* enabled */
                vsw_publish_macaddr(vswp, port);
        }

        D1(vswp, "%s: exit", __func__);
        return (0);

exit_error:

        /* tear down everything initialized above and free the port */
        cv_destroy(&port->state_cv);
        mutex_destroy(&port->state_lock);

        rw_destroy(&port->maccl_rwlock);
        mutex_destroy(&port->tx_lock);
        mutex_destroy(&port->mca_lock);
        kmem_free(port, sizeof (vsw_port_t));
        return (1);
}

/*
 * Detach the specified port.
 *
 * Looks up the port by instance under the port-list writer lock,
 * unlinks it from the list, then (with the lock dropped) tears down
 * its HybridIO state, mac client, fdb entry, vlans and multicast
 * addresses before deleting the port structure itself.
 *
 * Returns 0 on success, 1 on failure (port not found or could not
 * be unlinked from the list).
 */
int
vsw_port_detach(vsw_t *vswp, int p_instance)
{
        vsw_port_t      *port = NULL;
        vsw_port_list_t *plist = &vswp->plist;

        D1(vswp, "%s: enter: port id %d", __func__, p_instance);

        /* writer lock held across both lookup and unlink */
        WRITE_ENTER(&plist->lockrw);

        if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
                RW_EXIT(&plist->lockrw);
                return (1);
        }

        if (vsw_plist_del_node(vswp, port)) {
                RW_EXIT(&plist->lockrw);
                return (1);
        }

        /* cleanup any HybridIO for this port */
        vsw_hio_stop_port(port);

        /*
         * No longer need to hold writer lock on port list now
         * that we have unlinked the target port from the list.
         */
        RW_EXIT(&plist->lockrw);

        /* Cleanup and close the mac client */
        vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

        /* Remove the fdb entry for this port/mac address */
        vsw_fdbe_del(vswp, &(port->p_macaddr));
        vsw_destroy_vlans(port, VSW_VNETPORT);

        /* Remove any multicast addresses.. */
        vsw_del_mcst_port(port);

        /* quiesces the channel and frees the port structure */
        vsw_port_delete(port);

        D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
        return (0);
}

/*
 * Detach all active ports.
 *
 * Repeatedly unlinks the head of the port list and tears each port
 * down.  The list writer lock is dropped around vsw_port_delete()
 * because the port is already unlinked at that point and deletion
 * may block (draining taskq entries and channel callbacks).
 */
void
vsw_detach_ports(vsw_t *vswp)
{
        vsw_port_list_t         *plist = &vswp->plist;
        vsw_port_t              *port = NULL;

        D1(vswp, "%s: enter", __func__);

        WRITE_ENTER(&plist->lockrw);

        while ((port = plist->head) != NULL) {
                (void) vsw_plist_del_node(vswp, port);

                /* cleanup any HybridIO for this port */
                vsw_hio_stop_port(port);

                /* Cleanup and close the mac client */
                vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT);

                /* Remove the fdb entry for this port/mac address */
                vsw_fdbe_del(vswp, &(port->p_macaddr));
                vsw_destroy_vlans(port, VSW_VNETPORT);

                /* Remove any multicast addresses.. */
                vsw_del_mcst_port(port);

                /*
                 * No longer need to hold the lock on the port list
                 * now that we have unlinked the target port from the
                 * list.
                 */
                RW_EXIT(&plist->lockrw);
                vsw_port_delete(port);
                WRITE_ENTER(&plist->lockrw);
        }
        RW_EXIT(&plist->lockrw);

        D1(vswp, "%s: exit", __func__);
}

/*
 * Delete the specified port.
 *
 * The port must already have been unlinked from the vsw port list.
 * Quiesces the channel (disables callbacks, drains pending taskq
 * entries and active callbacks), detaches the channel, destroys the
 * port's synchronization primitives and frees the port memory.
 */
static void
vsw_port_delete(vsw_port_t *port)
{
        vsw_t                   *vswp = port->p_vswp;

        D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);

        /* disable callbacks on the channel before draining */
        vsw_ldc_uninit(port->ldcp);

        /*
         * Wait for any pending ctrl msg tasks which reference this
         * port to finish.
         */
        vsw_drain_port_taskq(port);

        /*
         * Wait for any active callbacks to finish
         */
        vsw_ldc_drain(port->ldcp);

        vsw_ldc_detach(port->ldcp);

        rw_destroy(&port->maccl_rwlock);
        mutex_destroy(&port->mca_lock);
        mutex_destroy(&port->tx_lock);

        cv_destroy(&port->state_cv);
        mutex_destroy(&port->state_lock);

        if (port->num_ldcs != 0) {
                kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t));
                port->num_ldcs = 0;
        }

        if (port->nvids != 0) {
                kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids);
        }

        kmem_free(port, sizeof (vsw_port_t));

        D1(vswp, "%s: exit", __func__);
}

/*
 * Attach a logical domain channel (ldc) under a specified port.
 *
 * Allocates and initializes the vsw_ldc_t, creates the channel with
 * ldc_init(), optionally starts a tx worker thread, registers the
 * channel callback, allocates the receive message buffer and sets up
 * kstats.  On success the channel is linked into the port (port->ldcp).
 *
 * On failure, all partially-acquired resources are released via the
 * progress flags below before returning.
 *
 * Returns 0 on success, 1 on failure.
 */
static int
vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
{
        vsw_t           *vswp = port->p_vswp;
        vsw_ldc_t       *ldcp = NULL;
        ldc_attr_t      attr;
        ldc_status_t    istatus;
        int             status = DDI_FAILURE;
        char            kname[MAXNAMELEN];
        enum            { PROG_init = 0x0,
                            PROG_callback = 0x1,
                            PROG_tx_thread = 0x2,
                            PROG_ldc_init = 0x4,
                            PROG_status_lock = 0x8}
                        progress;

        progress = PROG_init;

        D1(vswp, "%s: enter", __func__);

        ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
        if (ldcp == NULL) {
                DERR(vswp, "%s: kmem_zalloc failed", __func__);
                return (1);
        }
        ldcp->ldc_id = ldc_id;

        mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&ldcp->ldc_rxlock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
        ldcp->msg_thr_flags = 0;
        mutex_init(&ldcp->msg_thr_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ldcp->msg_thr_cv, NULL, CV_DRIVER, NULL);
        ldcp->rcv_thr_flags = 0;
        mutex_init(&ldcp->rcv_thr_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ldcp->rcv_thr_cv, NULL, CV_DRIVER, NULL);
        mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);

        /* required for handshake with peer */
        ldcp->local_session = (uint64_t)ddi_get_lbolt();
        ldcp->peer_session = 0;
        ldcp->session_status = 0;
        ldcp->hss_id = 1;       /* Initial handshake session id */
        ldcp->hphase = VSW_MILESTONE0;

        (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);

        /* only set for outbound lane, inbound set by peer */
        vsw_set_lane_attr(vswp, &ldcp->lane_out);

        attr.devclass = LDC_DEV_NT_SVC;
        attr.instance = ddi_get_instance(vswp->dip);
        attr.mode = LDC_MODE_UNRELIABLE;
        attr.mtu = VSW_LDC_MTU;
        status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
        if (status != 0) {
                DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
                    __func__, ldc_id, status);
                goto ldc_attach_fail;
        }
        progress |= PROG_ldc_init;

        if (vsw_ldc_txthr_enabled) {
                ldcp->tx_thr_flags = 0;
                ldcp->tx_mhead = ldcp->tx_mtail = NULL;

                mutex_init(&ldcp->tx_thr_lock, NULL, MUTEX_DRIVER, NULL);
                cv_init(&ldcp->tx_thr_cv, NULL, CV_DRIVER, NULL);
                ldcp->tx_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
                    vsw_ldc_tx_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);

                progress |= PROG_tx_thread;
                if (ldcp->tx_thread == NULL) {
                        DWARN(vswp, "%s(%lld): Failed to create worker thread",
                            __func__, ldc_id);
                        goto ldc_attach_fail;
                }
        }

        status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
        if (status != 0) {
                DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
                    __func__, ldc_id, status);
                goto ldc_attach_fail;
        }
        /*
         * allocate a message for ldc_read()s, big enough to hold ctrl and
         * data msgs, including raw data msgs used to recv priority frames.
         */
        ldcp->msglen = VIO_PKT_DATA_HDRSIZE + vswp->max_frame_size;
        ldcp->ldcmsg = kmem_alloc(ldcp->msglen, KM_SLEEP);

        progress |= PROG_callback;

        mutex_init(&ldcp->status_lock, NULL, MUTEX_DRIVER, NULL);
        progress |= PROG_status_lock;

        if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
                DERR(vswp, "%s: ldc_status failed", __func__);
                goto ldc_attach_fail;
        }

        ldcp->ldc_status = istatus;
        ldcp->ldc_port = port;
        ldcp->ldc_vswp = vswp;

        vsw_reset_vnet_proto_ops(ldcp);

        (void) sprintf(kname, "%sldc0x%lx", DRV_NAME, ldcp->ldc_id);
        ldcp->ksp = vgen_setup_kstats(DRV_NAME, vswp->instance,
            kname, &ldcp->ldc_stats);
        if (ldcp->ksp == NULL) {
                DERR(vswp, "%s: kstats setup failed", __func__);
                goto ldc_attach_fail;
        }

        /* link it into this port */
        port->ldcp = ldcp;

        D1(vswp, "%s: exit", __func__);
        return (0);

ldc_attach_fail:

        /* unwind in (roughly) reverse order of acquisition */
        if (progress & PROG_callback) {
                (void) ldc_unreg_callback(ldcp->ldc_handle);
                kmem_free(ldcp->ldcmsg, ldcp->msglen);
        }

        if (progress & PROG_status_lock) {
                mutex_destroy(&ldcp->status_lock);
        }

        if (progress & PROG_tx_thread) {
                if (ldcp->tx_thread != NULL) {
                        vsw_stop_tx_thread(ldcp);
                }
                mutex_destroy(&ldcp->tx_thr_lock);
                cv_destroy(&ldcp->tx_thr_cv);
        }

        /*
         * Fix: previously the channel was only finalized when
         * ldc_reg_callback() failed; failures after ldc_init()
         * (tx thread, ldc_status, kstats) leaked the ldc handle,
         * and a kstat failure also leaked status_lock.
         */
        if (progress & PROG_ldc_init) {
                (void) ldc_fini(ldcp->ldc_handle);
        }

        if (ldcp->ksp != NULL) {
                vgen_destroy_kstats(ldcp->ksp);
        }
        mutex_destroy(&ldcp->msg_thr_lock);
        mutex_destroy(&ldcp->rcv_thr_lock);
        mutex_destroy(&ldcp->ldc_txlock);
        mutex_destroy(&ldcp->ldc_rxlock);
        mutex_destroy(&ldcp->ldc_cblock);
        mutex_destroy(&ldcp->drain_cv_lock);
        cv_destroy(&ldcp->msg_thr_cv);
        cv_destroy(&ldcp->rcv_thr_cv);
        cv_destroy(&ldcp->drain_cv);

        kmem_free(ldcp, sizeof (vsw_ldc_t));

        return (1);
}

/*
 * Detach a logical domain channel (ldc) belonging to a
 * particular port.
 *
 * Stops the msg/rcv and tx worker threads, destroys kstats,
 * releases mapped lane resources (drings), closes and finalizes
 * the channel and frees the vsw_ldc_t.  The channel callback must
 * already have been unregistered (see vsw_ldc_drain()).
 */
static void
vsw_ldc_detach(vsw_ldc_t *ldcp)
{
        int             rv;
        vsw_t           *vswp = ldcp->ldc_port->p_vswp;
        int             retries = 0;

        D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);

        /* Stop msg/rcv thread (only one of the two is ever running) */
        if (ldcp->rcv_thread != NULL) {
                vsw_stop_rcv_thread(ldcp);
        } else if (ldcp->msg_thread != NULL) {
                vsw_stop_msg_thread(ldcp);
        }
        kmem_free(ldcp->ldcmsg, ldcp->msglen);

        /* Stop the tx thread and discard any frames still queued for tx */
        if (ldcp->tx_thread != NULL) {
                vsw_stop_tx_thread(ldcp);
                mutex_destroy(&ldcp->tx_thr_lock);
                cv_destroy(&ldcp->tx_thr_cv);
                if (ldcp->tx_mhead != NULL) {
                        freemsgchain(ldcp->tx_mhead);
                        ldcp->tx_mhead = ldcp->tx_mtail = NULL;
                        ldcp->tx_cnt = 0;
                }
        }

        /* Destroy kstats */
        vgen_destroy_kstats(ldcp->ksp);

        /*
         * Before we can close the channel we must release any mapped
         * resources (e.g. drings).
         */
        vsw_free_lane_resources(ldcp, INBOUND);
        vsw_free_lane_resources(ldcp, OUTBOUND);

        /*
         * Close the channel, retry on EAGAIN up to vsw_ldc_retries times.
         */
        while ((rv = ldc_close(ldcp->ldc_handle)) == EAGAIN) {
                if (++retries > vsw_ldc_retries) {
                        break;
                }
                drv_usecwait(vsw_ldc_delay);
        }
        if (rv != 0) {
                cmn_err(CE_NOTE,
                    "!vsw%d: Error(%d) closing the channel(0x%lx)\n",
                    vswp->instance, rv, ldcp->ldc_id);
        }

        (void) ldc_fini(ldcp->ldc_handle);

        ldcp->ldc_status = LDC_INIT;
        ldcp->ldc_handle = 0;
        ldcp->ldc_vswp = NULL;

        mutex_destroy(&ldcp->msg_thr_lock);
        mutex_destroy(&ldcp->rcv_thr_lock);
        mutex_destroy(&ldcp->ldc_txlock);
        mutex_destroy(&ldcp->ldc_rxlock);
        mutex_destroy(&ldcp->ldc_cblock);
        mutex_destroy(&ldcp->drain_cv_lock);
        mutex_destroy(&ldcp->status_lock);
        cv_destroy(&ldcp->msg_thr_cv);
        cv_destroy(&ldcp->rcv_thr_cv);
        cv_destroy(&ldcp->drain_cv);

        kmem_free(ldcp, sizeof (vsw_ldc_t));
}

/*
 * Open and attempt to bring up the channel. Note that channel
 * can only be brought up if peer has also opened channel.
 *
 * Returns 0 if can open and bring up channel, otherwise
 * returns 1.
 *
 * NOTE(review): an ldc_up() failure returns 1 even though the
 * comment at that site calls it non-fatal; the caller in
 * vsw_port_attach() ignores the return value, so this appears
 * to be cosmetic — confirm.
 */
static int
vsw_ldc_init(vsw_ldc_t *ldcp)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        ldc_status_t    istatus = 0;
        int             rv;

        D1(vswp, "%s: enter", __func__);

        /* hold all three channel locks (cb, rx, tx) across the bring-up */
        LDC_ENTER_LOCK(ldcp);

        /* don't start at 0 in case clients don't like that */
        ldcp->next_ident = 1;

        rv = ldc_open(ldcp->ldc_handle);
        if (rv != 0) {
                DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
                    __func__, ldcp->ldc_id, rv);
                LDC_EXIT_LOCK(ldcp);
                return (1);
        }

        if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
                DERR(vswp, "%s: unable to get status", __func__);
                LDC_EXIT_LOCK(ldcp);
                return (1);

        } else if (istatus != LDC_OPEN && istatus != LDC_READY) {
                DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
                    __func__, ldcp->ldc_id, istatus);
                LDC_EXIT_LOCK(ldcp);
                return (1);
        }

        mutex_enter(&ldcp->status_lock);
        ldcp->ldc_status = istatus;
        mutex_exit(&ldcp->status_lock);

        rv = ldc_up(ldcp->ldc_handle);
        if (rv != 0) {
                /*
                 * Not a fatal error for ldc_up() to fail, as peer
                 * end point may simply not be ready yet.
                 */
                D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
                    ldcp->ldc_id, rv);
                LDC_EXIT_LOCK(ldcp);
                return (1);
        }

        /*
         * ldc_up() call is non-blocking so need to explicitly
         * check channel status to see if in fact the channel
         * is UP.
         */
        mutex_enter(&ldcp->status_lock);
        if (ldc_status(ldcp->ldc_handle, &ldcp->ldc_status) != 0) {
                DERR(vswp, "%s: unable to get status", __func__);
                mutex_exit(&ldcp->status_lock);
                LDC_EXIT_LOCK(ldcp);
                return (1);

        }

        if (ldcp->ldc_status == LDC_UP) {
                D2(vswp, "%s: channel %ld now UP (%ld)", __func__,
                    ldcp->ldc_id, istatus);
                mutex_exit(&ldcp->status_lock);
                LDC_EXIT_LOCK(ldcp);

                /* channel is already up; kick off connection handling now */
                vsw_process_conn_evt(ldcp, VSW_CONN_UP);
                return (0);
        }

        mutex_exit(&ldcp->status_lock);
        LDC_EXIT_LOCK(ldcp);

        D1(vswp, "%s: exit", __func__);
        return (0);
}

/*
 * Quiesce the channel: disable further callbacks (interrupts) on
 * the channel and reset its recorded status to LDC_INIT.  All three
 * channel locks are held across the mode change so that no tx, rx
 * or callback path is active while callbacks are being disabled.
 * A failure to disable is logged but not treated as fatal.
 */
static void
vsw_ldc_uninit(vsw_ldc_t *ldcp)
{
        vsw_t   *vswp = ldcp->ldc_vswp;
        int     rv;

        D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);

        LDC_ENTER_LOCK(ldcp);

        rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
        if (rv != 0) {
                cmn_err(CE_NOTE, "!vsw_ldc_uninit(%ld): error disabling "
                    "interrupts (rv = %d)\n", ldcp->ldc_id, rv);
        }

        mutex_enter(&ldcp->status_lock);
        ldcp->ldc_status = LDC_INIT;
        mutex_exit(&ldcp->status_lock);

        LDC_EXIT_LOCK(ldcp);

        D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
}

/*
 * Wait until the callback(s) associated with the ldcs under the specified
 * port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 *
 * A short explanation of what we are doing below..
 *
 * The simplest approach would be to have a reference counter in
 * the ldc structure which is increment/decremented by the callbacks as
 * they use the channel. The drain function could then simply disable any
 * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
 * there is a tiny window here - before the callback is able to get the lock
 * on the channel it is interrupted and this function gets to execute. It
 * sees that the ref count is zero and believes its free to delete the
 * associated data structures.
 *
 * We get around this by taking advantage of the fact that before the ldc
 * framework invokes a callback it sets a flag to indicate that there is a
 * callback active (or about to become active). If when we attempt to
 * unregister a callback when this active flag is set then the unregister
 * will fail with EWOULDBLOCK.
 *
 * If the unregister fails we do a cv_timedwait. We will either be signaled
 * by the callback as it is exiting (note we have to wait a short period to
 * allow the callback to return fully to the ldc framework and it to clear
 * the active flag), or by the timer expiring. In either case we again attempt
 * the unregister. We repeat this until we can successfully unregister the
 * callback.
 *
 * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
 * the case where the callback has finished but the ldc framework has not yet
 * cleared the active flag. In this case we would never get a cv_signal.
 */
static void
vsw_ldc_drain(vsw_ldc_t *ldcp)
{
        vsw_t   *vswp = ldcp->ldc_port->p_vswp;

        D1(vswp, "%s: enter", __func__);

        /*
         * If we can unregister the channel callback then we
         * know that there is no callback either running or
         * scheduled to run for this channel so move on to next
         * channel in the list.
         */
        mutex_enter(&ldcp->drain_cv_lock);

        /* prompt active callbacks to quit */
        ldcp->drain_state = VSW_LDC_DRAINING;

        if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
                /* No callback active or pending; nothing to wait for. */
                D2(vswp, "%s: unreg callback for chan %ld", __func__,
                    ldcp->ldc_id);
                mutex_exit(&ldcp->drain_cv_lock);
        } else {
                /*
                 * If we end up here we know that either 1) a callback
                 * is currently executing, 2) is about to start (i.e.
                 * the ldc framework has set the active flag but
                 * has not actually invoked the callback yet, or 3)
                 * has finished and has returned to the ldc framework
                 * but the ldc framework has not yet cleared the
                 * active bit.
                 *
                 * Wait for it to finish.
                 *
                 * A timed wait (1 second) is used rather than cv_wait
                 * because in case 3 the callback has already exited and
                 * will never cv_signal() us; only the timeout wakes us
                 * to retry the unregister. Unregister failures other
                 * than EWOULDBLOCK terminate the loop.
                 */
                while (ldc_unreg_callback(ldcp->ldc_handle) == EWOULDBLOCK) {
                        (void) cv_timedwait(&ldcp->drain_cv,
                            &ldcp->drain_cv_lock, ddi_get_lbolt() + hz);
                }

                mutex_exit(&ldcp->drain_cv_lock);
                D2(vswp, "%s: unreg callback for chan %ld after "
                    "timeout", __func__, ldcp->ldc_id);
        }

        D1(vswp, "%s: exit", __func__);
}

/*
 * Wait until all tasks which reference this port have completed.
 *
 * Prior to this function being invoked each channel under this port
 * should have been quiesced via ldc_set_cb_mode(DISABLE).
 */
static void
vsw_drain_port_taskq(vsw_port_t *port)
{
        vsw_t           *vswp = port->p_vswp;
        boolean_t       dispatched;

        D1(vswp, "%s: enter", __func__);

        /*
         * Flag the port as detaching and queue a marker task behind any
         * tasks already dispatched; once the marker runs we know every
         * earlier task referencing this port has completed.
         */
        mutex_enter(&port->state_lock);
        port->state = VSW_PORT_DETACHING;

        dispatched = (vswp->taskq_p != NULL) &&
            (ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
            port, DDI_NOSLEEP) == DDI_SUCCESS);

        if (!dispatched) {
                cmn_err(CE_NOTE, "!vsw%d: unable to dispatch marker task",
                    vswp->instance);
                mutex_exit(&port->state_lock);
                return;
        }

        /* Sleep until vsw_marker_task() flips the state and signals us. */
        while (port->state != VSW_PORT_DETACHABLE)
                cv_wait(&port->state_cv, &port->state_lock);

        mutex_exit(&port->state_lock);

        D1(vswp, "%s: exit", __func__);
}

static void
vsw_marker_task(void *arg)
{
        vsw_port_t      *portp = arg;
        vsw_t           *vswp = portp->p_vswp;

        D1(vswp, "%s: enter", __func__);

        /*
         * By the time this marker runs, no remaining queued tasks can
         * reference the port, so mark it safe to detach and wake the
         * thread blocked in vsw_drain_port_taskq().
         */
        mutex_enter(&portp->state_lock);
        portp->state = VSW_PORT_DETACHABLE;
        cv_signal(&portp->state_cv);
        mutex_exit(&portp->state_lock);

        D1(vswp, "%s: exit", __func__);
}

/*
 * Return the port with the given instance number, or NULL if no such
 * port is on the switch's port list.
 */
vsw_port_t *
vsw_lookup_port(vsw_t *vswp, int p_instance)
{
        vsw_port_t      *portp = vswp->plist.head;

        while (portp != NULL) {
                if (portp->p_instance == p_instance) {
                        D2(vswp, "vsw_lookup_port: found p_instance\n");
                        break;
                }
                portp = portp->p_next;
        }

        return (portp);
}

void
vsw_vlan_unaware_port_reset(vsw_port_t *portp)
{
        vsw_ldc_t       *ldcp = portp->ldcp;
        boolean_t       restart;

        mutex_enter(&ldcp->ldc_cblock);

        /*
         * A peer running a pre-1.3 protocol version is vlan-unaware; if
         * vlan ids are configured on this port and the handshake has
         * completed, reset the channel and terminate the connection.
         * See comments in vsw_set_vnet_proto_ops().
         */
        restart = (ldcp->hphase == VSW_MILESTONE4) &&
            VSW_VER_LT(ldcp, 1, 3) && (portp->nvids != 0);

        if (restart)
                vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

        mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate)
{
        vsw_ldc_t       *ldcp = portp->ldcp;

        mutex_enter(&ldcp->ldc_cblock);

        /*
         * Only a HybridIO capable peer (ver >= 1.3) whose handshake has
         * completed needs a reset; the re-negotiation that follows in
         * turn triggers the HybridIO setup/cleanup.
         */
        if ((ldcp->hphase != VSW_MILESTONE4) ||
            (portp->p_hio_capable != B_TRUE)) {
                mutex_exit(&ldcp->ldc_cblock);
                return;
        }

        if (immediate == B_TRUE)
                (void) ldc_down(ldcp->ldc_handle);
        else
                vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

        mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_port_reset(vsw_port_t *portp)
{
        vsw_ldc_t       *ldcp = portp->ldcp;

        /* Reset the channel and terminate the connection. */
        mutex_enter(&ldcp->ldc_cblock);
        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
        mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_reset_ports(vsw_t *vswp)
{
        vsw_port_t      *portp;

        /* Hold the port list read lock while resetting each port. */
        READ_ENTER(&vswp->plist.lockrw);
        for (portp = vswp->plist.head; portp != NULL;
            portp = portp->p_next) {
                /* Stop HybridIO first if it is active on the port. */
                if ((portp->p_hio_capable) && (portp->p_hio_enabled))
                        vsw_hio_stop_port(portp);
                vsw_port_reset(portp);
        }
        RW_EXIT(&vswp->plist.lockrw);
}

static void
vsw_send_physlink_msg(vsw_ldc_t *ldcp, link_state_t plink_state)
{
        vnet_physlink_msg_t     msg;

        /* Encode the physical link state into the message payload. */
        msg.physlink_info = (plink_state == LINK_STATE_UP) ?
            VNET_PHYSLINK_STATE_UP : VNET_PHYSLINK_STATE_DOWN;

        /* Control message carrying a physical link state update. */
        msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        msg.tag.vio_subtype_env = VNET_PHYSLINK_INFO;
        msg.tag.vio_sid = ldcp->local_session;

        (void) vsw_send_msg(ldcp, &msg, sizeof (msg), B_TRUE);
}

static void
vsw_port_physlink_update(vsw_port_t *portp)
{
        vsw_ldc_t       *ldcp = portp->ldcp;
        vsw_t           *vswp = portp->p_vswp;

        mutex_enter(&ldcp->ldc_cblock);

        /*
         * Send the current physical link state, but only once the
         * handshake has completed and the vnet device has negotiated
         * to receive physical link state updates.
         */
        if ((ldcp->hphase == VSW_MILESTONE4) &&
            (ldcp->pls_negotiated == B_TRUE))
                vsw_send_physlink_msg(ldcp, vswp->phys_link_state);

        mutex_exit(&ldcp->ldc_cblock);
}

void
vsw_physlink_update_ports(vsw_t *vswp)
{
        vsw_port_t      *portp;

        /* Notify every port of the new physical link state. */
        READ_ENTER(&vswp->plist.lockrw);
        for (portp = vswp->plist.head; portp != NULL;
            portp = portp->p_next)
                vsw_port_physlink_update(portp);
        RW_EXIT(&vswp->plist.lockrw);
}

/*
 * Search for and remove the specified port from the port
 * list. Returns 0 if able to locate and remove port, otherwise
 * returns 1.
 */
static int
vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
{
        vsw_port_list_t *plist = &vswp->plist;
        vsw_port_t      *curr_p, *prev_p;

        if (plist->head == NULL)
                return (1);

        curr_p = prev_p = plist->head;

        while (curr_p != NULL) {
                if (curr_p == port) {
                        /* Unlink: head element vs interior element. */
                        if (prev_p == curr_p) {
                                plist->head = curr_p->p_next;
                        } else {
                                prev_p->p_next = curr_p->p_next;
                        }
                        plist->num_ports--;
                        return (0);
                }
                prev_p = curr_p;
                curr_p = curr_p->p_next;
        }

        /*
         * Port was not on the list. Return failure per the documented
         * contract; previously this path fell through and incorrectly
         * reported success.
         */
        return (1);
}

/*
 * Interrupt handler for ldc messages.
 */
static uint_t
vsw_ldc_cb(uint64_t event, caddr_t arg)
{
        vsw_ldc_t       *ldcp = (vsw_ldc_t  *)arg;
        vsw_t           *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

        mutex_enter(&ldcp->ldc_cblock);
        ldcp->ldc_stats.callbacks++;

        /*
         * Discard events which arrive while the channel is still in
         * its initial state or has no handle attached.
         */
        mutex_enter(&ldcp->status_lock);
        if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == 0)) {
                mutex_exit(&ldcp->status_lock);
                mutex_exit(&ldcp->ldc_cblock);
                return (LDC_SUCCESS);
        }
        mutex_exit(&ldcp->status_lock);

        if (event & LDC_EVT_UP) {
                /*
                 * Channel has come up.
                 */
                D2(vswp, "%s: id(%ld) event(%llx) UP: status(%ld)",
                    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

                vsw_process_conn_evt(ldcp, VSW_CONN_UP);

                ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
        }

        if (event & LDC_EVT_READ) {
                /*
                 * Data available for reading.
                 *
                 * Fixed format string: was "id(ld)" with only two
                 * conversion specifiers for three variadic arguments,
                 * so "ld" printed literally and the event value was
                 * consumed by the wrong specifier.
                 */
                D2(vswp, "%s: id(%ld) event(%llx) data READ",
                    __func__, ldcp->ldc_id, event);

                vsw_process_evt_read(ldcp);

                ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);

                goto vsw_cb_exit;
        }

        if (event & (LDC_EVT_DOWN | LDC_EVT_RESET)) {
                D2(vswp, "%s: id(%ld) event (%lx) DOWN/RESET: status(%ld)",
                    __func__, ldcp->ldc_id, event, ldcp->ldc_status);

                vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
        }

        /*
         * Catch either LDC_EVT_WRITE which we don't support or any
         * unknown event.
         */
        if (event &
            ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ)) {
                DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
                    __func__, ldcp->ldc_id, event, ldcp->ldc_status);
        }

vsw_cb_exit:
        mutex_exit(&ldcp->ldc_cblock);

        /*
         * Let the drain function know we are finishing if it
         * is waiting.
         */
        mutex_enter(&ldcp->drain_cv_lock);
        if (ldcp->drain_state == VSW_LDC_DRAINING)
                cv_signal(&ldcp->drain_cv);
        mutex_exit(&ldcp->drain_cv_lock);

        return (LDC_SUCCESS);
}

/*
 * Reinitialise data structures associated with the channel.
 */
static void
vsw_ldc_reinit(vsw_ldc_t *ldcp)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        vsw_port_t      *portp = ldcp->ldc_port;

        D1(vswp, "%s: enter", __func__);

        D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
            ldcp->lane_in.lstate, ldcp->lane_out.lstate);

        /* Release lane resources in both directions and clear state. */
        vsw_free_lane_resources(ldcp, INBOUND);
        vsw_free_lane_resources(ldcp, OUTBOUND);
        ldcp->lane_in.lstate = 0;
        ldcp->lane_out.lstate = 0;

        /*
         * Remove parent port from any multicast groups
         * it may have registered with. Client must resend
         * multicast add command after handshake completes.
         */
        vsw_del_mcst_port(portp);

        /* Rewind all handshake tracking to its initial state. */
        ldcp->peer_session = 0;
        ldcp->session_status = 0;
        ldcp->hcnt = 0;
        ldcp->hphase = VSW_MILESTONE0;

        vsw_reset_vnet_proto_ops(ldcp);

        D1(vswp, "%s: exit", __func__);
}

/*
 * Process a connection event.
 */
void
vsw_process_conn_evt(vsw_ldc_t *ldcp, uint16_t evt)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        vsw_conn_evt_t  *conn = NULL;

        D1(vswp, "%s: enter", __func__);

        /*
         * Check if either a reset or restart event is pending
         * or in progress. If so just return.
         *
         * A VSW_CONN_RESET event originates either with a LDC_RESET_EVT
         * being received by the callback handler, or a ECONNRESET error
         * code being returned from a ldc_read() or ldc_write() call.
         *
         * A VSW_CONN_RESTART event occurs when some error checking code
         * decides that there is a problem with data from the channel,
         * and that the handshake should be restarted.
         *
         * ldstub() atomically test-and-sets reset_active, so only one
         * thread at a time can initiate a reset/restart; the flag is
         * cleared by vsw_conn_task() (or below on dispatch failure).
         */
        if (((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART)) &&
            (ldstub((uint8_t *)&ldcp->reset_active)))
                return;

        /*
         * If it is an LDC_UP event we first check the recorded
         * state of the channel. If this is UP then we know that
         * the channel moving to the UP state has already been dealt
         * with and don't need to dispatch a new task.
         *
         * The reason for this check is that when we do a ldc_up(),
         * depending on the state of the peer, we may or may not get
         * a LDC_UP event. As we can't depend on getting a LDC_UP evt
         * every time we do ldc_up() we explicitly check the channel
         * status to see has it come up (ldc_up() is asynch and will
         * complete at some undefined time), and take the appropriate
         * action.
         *
         * The flip side of this is that we may get a LDC_UP event
         * when we have already seen that the channel is up and have
         * dealt with that.
         */
        mutex_enter(&ldcp->status_lock);
        if (evt == VSW_CONN_UP) {
                if ((ldcp->ldc_status == LDC_UP) || (ldcp->reset_active != 0)) {
                        mutex_exit(&ldcp->status_lock);
                        return;
                }
        }
        mutex_exit(&ldcp->status_lock);

        /*
         * The transaction group id allows us to identify and discard
         * any tasks which are still pending on the taskq and refer
         * to the handshake session we are about to restart or reset.
         * These stale messages no longer have any real meaning.
         */
        (void) atomic_inc_32(&ldcp->hss_id);

        ASSERT(vswp->taskq_p != NULL);

        /* KM_NOSLEEP: may be called from interrupt/callback context. */
        if ((conn = kmem_zalloc(sizeof (vsw_conn_evt_t), KM_NOSLEEP)) == NULL) {
                cmn_err(CE_WARN, "!vsw%d: unable to allocate memory for"
                    " connection event", vswp->instance);
                goto err_exit;
        }

        conn->evt = evt;
        conn->ldcp = ldcp;

        if (ddi_taskq_dispatch(vswp->taskq_p, vsw_conn_task, conn,
            DDI_NOSLEEP) != DDI_SUCCESS) {
                cmn_err(CE_WARN, "!vsw%d: Can't dispatch connection task",
                    vswp->instance);

                kmem_free(conn, sizeof (vsw_conn_evt_t));
                goto err_exit;
        }

        D1(vswp, "%s: exit", __func__);
        return;

err_exit:
        /*
         * Has most likely failed due to memory shortage. Clear the flag so
         * that future requests will at least be attempted and will hopefully
         * succeed.
         */
        if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
                ldcp->reset_active = 0;
}

/*
 * Deal with events relating to a connection. Invoked from a taskq.
 */
static void
vsw_conn_task(void *arg)
{
        vsw_conn_evt_t  *conn = (vsw_conn_evt_t *)arg;
        vsw_ldc_t       *ldcp = NULL;
        vsw_port_t      *portp;
        vsw_t           *vswp = NULL;
        uint16_t        evt;
        ldc_status_t    curr_status;

        ldcp = conn->ldcp;
        evt = conn->evt;
        vswp = ldcp->ldc_vswp;
        portp = ldcp->ldc_port;

        D1(vswp, "%s: enter", __func__);

        /* can safely free now have copied out data */
        kmem_free(conn, sizeof (vsw_conn_evt_t));

        /* Stop any thread servicing this channel before reinitialising. */
        if (ldcp->rcv_thread != NULL) {
                vsw_stop_rcv_thread(ldcp);
        } else if (ldcp->msg_thread != NULL) {
                vsw_stop_msg_thread(ldcp);
        }

        mutex_enter(&ldcp->status_lock);
        if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
                /*
                 * NOTE(review): on this early-return path reset_active
                 * is left set, so further resets/restarts are blocked
                 * until something else clears it — confirm intended.
                 */
                cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
                    "channel %ld", vswp->instance, ldcp->ldc_id);
                mutex_exit(&ldcp->status_lock);
                return;
        }

        /*
         * If we wish to restart the handshake on this channel, then if
         * the channel is UP we bring it DOWN to flush the underlying
         * ldc queue.
         */
        if ((evt == VSW_CONN_RESTART) && (curr_status == LDC_UP))
                (void) ldc_down(ldcp->ldc_handle);

        if ((portp->p_hio_capable) && (portp->p_hio_enabled)) {
                vsw_hio_stop(vswp, ldcp);
        }

        /*
         * re-init all the associated data structures.
         */
        vsw_ldc_reinit(ldcp);

        /*
         * Bring the channel back up (note it does no harm to
         * do this even if the channel is already UP, Just
         * becomes effectively a no-op).
         */
        (void) ldc_up(ldcp->ldc_handle);

        /*
         * Check if channel is now UP. This will only happen if
         * peer has also done a ldc_up().
         */
        if (ldc_status(ldcp->ldc_handle, &curr_status) != 0) {
                cmn_err(CE_WARN, "!vsw%d: Unable to read status of "
                    "channel %ld", vswp->instance, ldcp->ldc_id);
                mutex_exit(&ldcp->status_lock);
                return;
        }

        ldcp->ldc_status = curr_status;

        /* channel UP so restart handshake by sending version info */
        if (curr_status == LDC_UP) {
                /* Give up after too many failed handshake attempts. */
                if (ldcp->hcnt++ > vsw_num_handshakes) {
                        cmn_err(CE_WARN, "!vsw%d: exceeded number of permitted"
                            " handshake attempts (%d) on channel %ld",
                            vswp->instance, ldcp->hcnt, ldcp->ldc_id);
                        mutex_exit(&ldcp->status_lock);
                        return;
                }

                if (vsw_obp_ver_proto_workaround == B_FALSE &&
                    (ddi_taskq_dispatch(vswp->taskq_p, vsw_send_ver, ldcp,
                    DDI_NOSLEEP) != DDI_SUCCESS)) {
                        cmn_err(CE_WARN, "!vsw%d: Can't dispatch version task",
                            vswp->instance);

                        /*
                         * Don't count as valid restart attempt if couldn't
                         * send version msg.
                         */
                        if (ldcp->hcnt > 0)
                                ldcp->hcnt--;
                }
        }

        /*
         * Mark that the process is complete by clearing the flag.
         *
         * Note is it possible that the taskq dispatch above may have failed,
         * most likely due to memory shortage. We still clear the flag so
         * future attempts will at least be attempted and will hopefully
         * succeed.
         */
        if ((evt == VSW_CONN_RESET) || (evt == VSW_CONN_RESTART))
                ldcp->reset_active = 0;

        mutex_exit(&ldcp->status_lock);

        D1(vswp, "%s: exit", __func__);
}

/*
 * returns 0 if legal for event signified by flag to have
 * occurred at the time it did. Otherwise returns 1.
 */
int
vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        uint64_t        state;
        uint64_t        phase;

        /* Work on the lane state for the direction being validated. */
        if (dir == INBOUND)
                state = ldcp->lane_in.lstate;
        else
                state = ldcp->lane_out.lstate;

        phase = ldcp->hphase;

        /*
         * For each *_INFO_RECV flag the current handshake phase must be
         * one in which that message is legal. For each *_ACK/NACK_RECV
         * flag the matching *_INFO_SENT bit must be set (we only accept
         * an ack/nack for an info we actually sent); the bit is cleared
         * once consumed. Any violation restarts the handshake and
         * returns 1.
         */
        switch (flag) {
        case VSW_VER_INFO_RECV:
                if (phase > VSW_MILESTONE0) {
                        DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
                            " when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                }
                break;

        case VSW_VER_ACK_RECV:
        case VSW_VER_NACK_RECV:
                if (!(state & VSW_VER_INFO_SENT)) {
                        DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK or "
                            "VER_NACK when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                } else
                        state &= ~VSW_VER_INFO_SENT;
                break;

        case VSW_ATTR_INFO_RECV:
                if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
                        DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
                            " when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                }
                break;

        case VSW_ATTR_ACK_RECV:
        case VSW_ATTR_NACK_RECV:
                if (!(state & VSW_ATTR_INFO_SENT)) {
                        DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
                            " or ATTR_NACK when in state %d\n",
                            ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                } else
                        state &= ~VSW_ATTR_INFO_SENT;
                break;

        case VSW_DRING_INFO_RECV:
                if (phase < VSW_MILESTONE1) {
                        DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
                            " when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                }
                break;

        case VSW_DRING_ACK_RECV:
        case VSW_DRING_NACK_RECV:
                if (!(state & VSW_DRING_INFO_SENT)) {
                        DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK "
                            " or DRING_NACK when in state %d\n",
                            ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                } else
                        state &= ~VSW_DRING_INFO_SENT;
                break;

        case VSW_RDX_INFO_RECV:
                if (phase < VSW_MILESTONE3) {
                        DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
                            " when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                }
                break;

        case VSW_RDX_ACK_RECV:
        case VSW_RDX_NACK_RECV:
                if (!(state & VSW_RDX_INFO_SENT)) {
                        DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK or "
                            "RDX_NACK when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                } else
                        state &= ~VSW_RDX_INFO_SENT;
                break;

        case VSW_MCST_INFO_RECV:
                if (phase < VSW_MILESTONE3) {
                        DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
                            " when in state %d\n", ldcp->ldc_id, phase);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return (1);
                }
                break;

        default:
                DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
                    ldcp->ldc_id, flag);
                return (1);
        }

        /* Event was legal; write back the (possibly updated) lane state. */
        if (dir == INBOUND)
                ldcp->lane_in.lstate = state;
        else
                ldcp->lane_out.lstate = state;

        D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);

        return (0);
}

void
vsw_next_milestone(vsw_ldc_t *ldcp)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        vsw_port_t      *portp = ldcp->ldc_port;
        lane_t          *lane_out = &ldcp->lane_out;
        lane_t          *lane_in = &ldcp->lane_in;

        D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
            ldcp->ldc_id, ldcp->hphase);

        DUMP_FLAGS(lane_in->lstate);
        DUMP_FLAGS(lane_out->lstate);

        /*
         * Advance the handshake state machine based on what has been
         * sent and received on each lane so far.
         */
        switch (ldcp->hphase) {

        case VSW_MILESTONE0:
                /*
                 * If we haven't started to handshake with our peer,
                 * start to do so now.
                 */
                if (lane_out->lstate == 0) {
                        D2(vswp, "%s: (chan %lld) starting handshake "
                            "with peer", __func__, ldcp->ldc_id);
                        vsw_process_conn_evt(ldcp, VSW_CONN_UP);
                }

                /*
                 * Only way to pass this milestone is to have successfully
                 * negotiated version info.
                 */
                if ((lane_in->lstate & VSW_VER_ACK_SENT) &&
                    (lane_out->lstate & VSW_VER_ACK_RECV)) {

                        D2(vswp, "%s: (chan %lld) leaving milestone 0",
                            __func__, ldcp->ldc_id);

                        /* Bind proto ops matching the negotiated version. */
                        vsw_set_vnet_proto_ops(ldcp);

                        /*
                         * Next milestone is passed when attribute
                         * information has been successfully exchanged.
                         */
                        ldcp->hphase = VSW_MILESTONE1;
                        vsw_send_attr(ldcp);

                }
                break;

        case VSW_MILESTONE1:
                /*
                 * Only way to pass this milestone is to have successfully
                 * negotiated attribute information, in both directions.
                 */
                if (!((lane_in->lstate & VSW_ATTR_ACK_SENT) &&
                    (lane_out->lstate & VSW_ATTR_ACK_RECV))) {
                        break;
                }

                ldcp->hphase = VSW_MILESTONE2;

                /*
                 * If the peer device has said it wishes to
                 * use descriptor rings then we send it our ring
                 * info, otherwise we just set up a private ring
                 * which we use an internal buffer
                 */
                if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
                    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
                    (VSW_VER_LT(ldcp, 1, 2) &&
                    (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
                        vsw_send_dring_info(ldcp);
                        break;
                }

                /*
                 * The peer doesn't operate in dring mode; we
                 * can simply fallthru to the RDX phase from
                 * here.
                 */
                /*FALLTHRU*/

        case VSW_MILESTONE2:
                /*
                 * If peer has indicated in its attribute message that
                 * it wishes to use descriptor rings then the only way
                 * to pass this milestone is for us to have received
                 * valid dring info.
                 *
                 * If peer is not using descriptor rings then just fall
                 * through.
                 */
                if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
                    (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
                    (VSW_VER_LT(ldcp, 1, 2) &&
                    (lane_in->xfer_mode ==
                    VIO_DRING_MODE_V1_0))) {
                        if (!(lane_in->lstate & VSW_DRING_ACK_SENT))
                                break;
                }

                D2(vswp, "%s: (chan %lld) leaving milestone 2",
                    __func__, ldcp->ldc_id);

                ldcp->hphase = VSW_MILESTONE3;
                vsw_send_rdx(ldcp);
                break;

        case VSW_MILESTONE3:
                /*
                 * Pass this milestone when all parameters have been
                 * successfully exchanged and RDX sent in both directions.
                 *
                 * Mark the relevant lane as available to transmit data. In
                 * RxDringData mode, lane_in is associated with transmit and
                 * lane_out is associated with receive. It is the reverse in
                 * TxDring mode.
                 */
                if ((lane_out->lstate & VSW_RDX_ACK_SENT) &&
                    (lane_in->lstate & VSW_RDX_ACK_RECV)) {

                        D2(vswp, "%s: (chan %lld) leaving milestone 3",
                            __func__, ldcp->ldc_id);
                        D2(vswp, "%s: ** handshake complete (0x%llx : "
                            "0x%llx) **", __func__, lane_in->lstate,
                            lane_out->lstate);
                        if (lane_out->dring_mode == VIO_RX_DRING_DATA) {
                                lane_in->lstate |= VSW_LANE_ACTIVE;
                        } else {
                                lane_out->lstate |= VSW_LANE_ACTIVE;
                        }
                        ldcp->hphase = VSW_MILESTONE4;
                        ldcp->hcnt = 0;
                        DISPLAY_STATE();
                        /* Start HIO if enabled and capable */
                        if ((portp->p_hio_enabled) && (portp->p_hio_capable)) {
                                D2(vswp, "%s: start HybridIO setup", __func__);
                                vsw_hio_start(vswp, ldcp);
                        }

                        if (ldcp->pls_negotiated == B_TRUE) {
                                /*
                                 * The vnet device has negotiated to get phys
                                 * link updates. Now that the handshake with
                                 * the vnet device is complete, send an initial
                                 * update with the current physical link state.
                                 */
                                vsw_send_physlink_msg(ldcp,
                                    vswp->phys_link_state);
                        }

                } else {
                        D2(vswp, "%s: still in milestone 3 (0x%llx : 0x%llx)",
                            __func__, lane_in->lstate,
                            lane_out->lstate);
                }
                break;

        case VSW_MILESTONE4:
                /* Handshake already complete; nothing further to do. */
                D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
                    ldcp->ldc_id);
                break;

        default:
                DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
                    ldcp->ldc_id, ldcp->hphase);
        }

        D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
            ldcp->hphase);
}

/*
 * Check if major version is supported.
 *
 * Returns 0 if finds supported major number, and if necessary
 * adjusts the minor field.
 *
 * Returns 1 if can't match major number exactly. Sets mjor/minor
 * to next lowest support values, or to zero if no other values possible.
 */
static int
vsw_supported_version(vio_ver_msg_t *vp)
{
        int     idx;

        D1(NULL, "vsw_supported_version: enter");

        for (idx = 0; idx < VSW_NUM_VER; idx++) {
                if (vp->ver_major == vsw_versions[idx].ver_major) {
                        /*
                         * Exact major match. Cap the minor number to the
                         * highest we support for this major.
                         */
                        if (vsw_versions[idx].ver_minor < vp->ver_minor) {
                                D2(NULL, "%s: adjusting minor value from %d "
                                    "to %d", __func__, vp->ver_minor,
                                    vsw_versions[idx].ver_minor);
                                vp->ver_minor = vsw_versions[idx].ver_minor;
                        }

                        return (0);
                }

                /*
                 * The message carries a higher major version than this
                 * table entry; rewrite the message's major/minor to this
                 * (next lower supported) pairing and return false, so the
                 * message gets resent with these values.
                 */
                if (vp->ver_major > vsw_versions[idx].ver_major) {
                        D2(NULL, "%s: adjusting major and minor "
                            "values to %d, %d\n",
                            __func__, vsw_versions[idx].ver_major,
                            vsw_versions[idx].ver_minor);
                        vp->ver_major = vsw_versions[idx].ver_major;
                        vp->ver_minor = vsw_versions[idx].ver_minor;
                        return (1);
                }
        }

        /* Ran off the end of the table; no supported version remains. */
        vp->ver_major = 0;
        vp->ver_minor = 0;

        D1(NULL, "vsw_supported_version: exit");

        return (1);
}

/*
 * Set vnet-protocol-version dependent functions based on version.
 */
static void
vsw_set_vnet_proto_ops(vsw_ldc_t *ldcp)
{
        vsw_t   *vswp = ldcp->ldc_vswp;
        lane_t  *lanep = &ldcp->lane_out;

        /*
         * Select the dring mode(s) to advertise in vsw_send_attr().
         *
         * In versions < 1.6 only TxDring mode is supported; there the msg
         * worker thread processes all VIO msgs (ctrl and data).
         *
         * In versions >= 1.6 RxDringData mode is also supported. In that
         * mode the rcv worker thread handles dring data messages (msgtype:
         * VIO_TYPE_DATA, subtype: VIO_SUBTYPE_INFO, env: VIO_DRING_DATA),
         * while remaining data messages (including acks) and ctrl messages
         * go straight through the callback (intr) thread.
         *
         * Even for versions >= 1.6 we may fall back to TxDring mode, if
         * RxDringData has been disabled on this guest or on the peer; that
         * is determined during the attr exchange phase of the handshake,
         * so the actual worker-thread setup happens after the attr phase.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 6) && vsw_mapin_avail(ldcp) == B_TRUE) {
                lanep->dring_mode = (VIO_RX_DRING_DATA | VIO_TX_DRING);
        } else {
                lanep->dring_mode = VIO_TX_DRING;
        }

        /*
         * Select the MTU for attribute negotiation based on the version.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 4)) {
                /*
                 * Peers >= 1.4 (Jumbo Frame Support): advertise our
                 * max_frame_size.
                 */
                lanep->mtu = vswp->max_frame_size;
        } else if (VSW_VER_EQ(ldcp, 1, 3)) {
                /*
                 * Peers == 1.3 (Vlan Tag Support): advertise
                 * ETHERMAX + VLAN_TAGSZ.
                 */
                lanep->mtu = ETHERMAX + VLAN_TAGSZ;
        } else if (ldcp->ldc_port->nvids == 0) {
                /*
                 * Pre-1.3 peers expect a max frame size of ETHERMAX. We can
                 * negotiate that size only when our peer has just a pvid and
                 * no vids; then we send/recv only untagged frames of max
                 * size ETHERMAX. Note the peer's pvid may differ from ours,
                 * as vsw must serve the vnet in that vlan even if vsw itself
                 * is not assigned to it.
                 */
                lanep->mtu = ETHERMAX;
        }

        /*
         * Select the version dependent data processing functions.
         */
        if (!VSW_VER_GTEQ(ldcp, 1, 2)) {
                /* Versions prior to 1.2: revert to the v1.0 routines. */
                vsw_reset_vnet_proto_ops(ldcp);
                return;
        }

        /* Versions >= 1.2 */
        if (VSW_PRI_ETH_DEFINED(vswp)) {
                /*
                 * Enable the priority tx routine and pkt mode only when at
                 * least one pri-eth-type is specified in the MD.
                 */
                ldcp->tx = vsw_ldctx_pri;
                ldcp->rx_pktdata = vsw_process_pkt_data;

                /* set xfer mode for vsw_send_attr() */
                lanep->xfer_mode = VIO_PKT_MODE | VIO_DRING_MODE_V1_2;
        } else {
                /* No priority eth types defined in the MD. */
                ldcp->tx = vsw_ldctx;
                ldcp->rx_pktdata = vsw_process_pkt_data_nop;

                /* set xfer mode for vsw_send_attr() */
                lanep->xfer_mode = VIO_DRING_MODE_V1_2;
        }
}

/*
 * Reset vnet-protocol-version dependent functions to v1.0.
 */
static void
vsw_reset_vnet_proto_ops(vsw_ldc_t *ldcp)
{
        ldcp->tx = vsw_ldctx;
        ldcp->rx_pktdata = vsw_process_pkt_data_nop;

        /* v1.0 xfer mode, used by vsw_send_attr() */
        ldcp->lane_out.xfer_mode = VIO_DRING_MODE_V1_0;
}

/*
 * Handle an LDC read event for the channel: either wake the message
 * worker thread (TxDring mode) or process the incoming packets directly
 * in the callback context.
 *
 * Called with ldcp->ldc_cblock held; the lock is dropped and re-acquired
 * around the worker-thread wakeup so it is not held while taking
 * msg_thr_lock and signalling.
 */
static void
vsw_process_evt_read(vsw_ldc_t *ldcp)
{
        if (ldcp->msg_thread != NULL) {
                /*
                 * TxDring mode; wakeup message worker
                 * thread to process the VIO messages.
                 */
                mutex_exit(&ldcp->ldc_cblock);
                mutex_enter(&ldcp->msg_thr_lock);
                /* Only signal if the worker hasn't already been notified. */
                if (!(ldcp->msg_thr_flags & VSW_WTHR_DATARCVD)) {
                        ldcp->msg_thr_flags |= VSW_WTHR_DATARCVD;
                        cv_signal(&ldcp->msg_thr_cv);
                }
                mutex_exit(&ldcp->msg_thr_lock);
                mutex_enter(&ldcp->ldc_cblock);
        } else {
                /*
                 * We invoke vsw_process_pkt() in the context of the LDC
                 * callback (vsw_ldc_cb()) during handshake, until the dring
                 * mode is negotiated. After the dring mode is negotiated, the
                 * msgs are processed by the msg worker thread (above case) if
                 * the dring mode is TxDring. Otherwise (in RxDringData mode)
                 * we continue to process the msgs directly in the callback
                 * context.
                 */
                vsw_process_pkt(ldcp);
        }
}

/*
 * Main routine for processing messages received over LDC.
 *
 * Reads messages from the channel until it is drained (or reset) and
 * dispatches each one based on the VIO message tag. Must be called with
 * ldcp->ldc_cblock held.
 */
void
vsw_process_pkt(void *arg)
{
        vsw_ldc_t       *ldcp = (vsw_ldc_t  *)arg;
        vsw_t           *vswp = ldcp->ldc_vswp;
        size_t          msglen;
        vio_msg_tag_t   *tagp;
        uint64_t        *ldcmsg;
        int             rv = 0;


        D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);

        ASSERT(MUTEX_HELD(&ldcp->ldc_cblock));

        ldcmsg = ldcp->ldcmsg;
        /*
         * If channel is up read messages until channel is empty.
         */
        do {
                /* Reset to the full buffer size before each read. */
                msglen = ldcp->msglen;
                rv = ldc_read(ldcp->ldc_handle, (caddr_t)ldcmsg, &msglen);

                if (rv != 0) {
                        DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) len(%d)\n",
                            __func__, ldcp->ldc_id, rv, msglen);
                }

                /* channel has been reset */
                if (rv == ECONNRESET) {
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESET);
                        break;
                }

                if (msglen == 0) {
                        D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
                            ldcp->ldc_id);
                        break;
                }

                D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
                    ldcp->ldc_id, msglen);

                /*
                 * Figure out what sort of packet we have gotten by
                 * examining the msg tag, and then switch it appropriately.
                 */
                tagp = (vio_msg_tag_t *)ldcmsg;

                switch (tagp->vio_msgtype) {
                case VIO_TYPE_CTRL:
                        vsw_dispatch_ctrl_task(ldcp, ldcmsg, tagp, msglen);
                        break;
                case VIO_TYPE_DATA:
                        vsw_process_data_pkt(ldcp, ldcmsg, tagp, msglen);
                        break;
                case VIO_TYPE_ERR:
                        vsw_process_err_pkt(ldcp, ldcmsg, tagp);
                        break;
                default:
                        /*
                         * Fix: the format string was previously split into
                         * two arguments, so "id(%lx)\n" was itself consumed
                         * by %lx (printing a pointer) and ldc_id was never
                         * printed. Use a single format string instead.
                         */
                        DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
                            __func__, tagp->vio_msgtype, ldcp->ldc_id);
                        break;
                }
        } while (msglen);

        D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
}

/*
 * Dispatch a task to process a VIO control message.
 *
 * RDX ACKs are handled inline (see below); all other control messages are
 * copied into a task structure and dispatched to the processing taskq,
 * provided the port is not being detached. On allocation or dispatch
 * failure the connection is restarted.
 */
static void
vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t *tagp,
    int msglen)
{
        vsw_ctrl_task_t         *ctaskp = NULL;
        vsw_port_t              *port = ldcp->ldc_port;
        vsw_t                   *vswp = port->p_vswp;

        D1(vswp, "%s: enter", __func__);

        /*
         * We need to handle RDX ACK messages in-band as once they
         * are exchanged it is possible that we will get an
         * immediate (legitimate) data packet.
         */
        if ((tagp->vio_subtype_env == VIO_RDX) &&
            (tagp->vio_subtype == VIO_SUBTYPE_ACK)) {

                if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_ACK_RECV))
                        return;

                ldcp->lane_in.lstate |= VSW_RDX_ACK_RECV;
                D2(vswp, "%s (%ld) handling RDX_ACK in place "
                    "(ostate 0x%llx : hphase %d)", __func__,
                    ldcp->ldc_id, ldcp->lane_in.lstate, ldcp->hphase);
                vsw_next_milestone(ldcp);
                return;
        }

        /* KM_NOSLEEP: we are called from the LDC callback path. */
        ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);

        if (ctaskp == NULL) {
                DERR(vswp, "%s: unable to alloc space for ctrl msg", __func__);
                vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                return;
        }

        /*
         * Snapshot the message and the current handshake session id; the
         * taskq handler uses hss_id to discard stale packets.
         */
        ctaskp->ldcp = ldcp;
        bcopy((def_msg_t *)cpkt, &ctaskp->pktp, msglen);
        ctaskp->hss_id = ldcp->hss_id;

        /*
         * Dispatch task to processing taskq if port is not in
         * the process of being detached.
         */
        mutex_enter(&port->state_lock);
        if (port->state == VSW_PORT_INIT) {
                if ((vswp->taskq_p == NULL) ||
                    (ddi_taskq_dispatch(vswp->taskq_p, vsw_process_ctrl_pkt,
                    ctaskp, DDI_NOSLEEP) != DDI_SUCCESS)) {
                        mutex_exit(&port->state_lock);
                        DERR(vswp, "%s: unable to dispatch task to taskq",
                            __func__);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
                        return;
                }
        } else {
                /* Port is detaching; drop the message. */
                kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
                DWARN(vswp, "%s: port %d detaching, not dispatching "
                    "task", __func__, port->p_instance);
        }

        mutex_exit(&port->state_lock);

        /* NOTE(review): logged even on the detach path above, where no
         * task was actually dispatched — debug output only. */
        D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
            ldcp->ldc_id);
        D1(vswp, "%s: exit", __func__);
}

/*
 * Process a VIO ctrl message. Invoked from taskq.
 */
static void
vsw_process_ctrl_pkt(void *arg)
{
        vsw_ctrl_task_t *ctaskp = (vsw_ctrl_task_t *)arg;
        vsw_ldc_t       *ldcp = ctaskp->ldcp;
        vsw_t           *vswp = ldcp->ldc_vswp;
        vio_msg_tag_t   tag;
        uint16_t        env;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
        env = tag.vio_subtype_env;

        /* stale pkt check */
        if (ctaskp->hss_id < ldcp->hss_id) {
                DWARN(vswp, "%s: discarding stale packet belonging to earlier"
                    " (%ld) handshake session", __func__, ctaskp->hss_id);
                kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
                return;
        }

        /* session id check */
        if (ldcp->session_status & VSW_PEER_SESSION) {
                if (ldcp->peer_session != tag.vio_sid) {
                        DERR(vswp, "%s (chan %d): invalid session id (%llx)",
                            __func__, ldcp->ldc_id, tag.vio_sid);
                        kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return;
                }
        }

        /*
         * Switch on vio_subtype envelope, then let lower routines
         * decide if its an INFO, ACK or NACK packet.
         */
        switch (env) {
        case VIO_VER_INFO:
                vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
                break;
        case VIO_DRING_REG:
                vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
                break;
        case VIO_DRING_UNREG:
                vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
                break;
        case VIO_ATTR_INFO:
                vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
                break;
        case VNET_MCAST_INFO:
                vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
                break;
        case VIO_RDX:
                vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
                break;
        case VIO_DDS_INFO:
                vsw_process_dds_msg(vswp, ldcp, &ctaskp->pktp);
                break;

        case VNET_PHYSLINK_INFO:
                vsw_process_physlink_msg(ldcp, &ctaskp->pktp);
                break;
        default:
                DERR(vswp, "%s: unknown vio_subtype_env (%x)\n", __func__, env);
        }

        kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Version negotiation. We can end up here either because our peer
 * has responded to a handshake message we sent it, or because our peer
 * has initiated a handshake with us. If it's the former, the message can
 * only be an ACK or NACK; if it's the latter, it can only be an INFO.
 *
 * If it's an ACK we move to the next stage of the handshake, namely
 * attribute exchange. If it's a NACK we see if we can specify another
 * version; if we can't, we stop.
 *
 * If it is an INFO we reset all params associated with communication
 * in that direction over this channel (remember the connection is
 * essentially 2 independent simplex channels).
 */
void
vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vio_ver_msg_t   *ver_pkt;
        vsw_t           *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        /*
         * We know this is a ctrl/version packet so
         * cast it into the correct structure.
         */
        ver_pkt = (vio_ver_msg_t *)pkt;

        switch (ver_pkt->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                /* Peer-initiated handshake: validate and ACK/NACK. */
                D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");

                /*
                 * Record the session id, which we will use from now
                 * until we see another VER_INFO msg. Even then the
                 * session id in most cases will be unchanged, except
                 * if channel was reset.
                 */
                if ((ldcp->session_status & VSW_PEER_SESSION) &&
                    (ldcp->peer_session != ver_pkt->tag.vio_sid)) {
                        DERR(vswp, "%s: updating session id for chan %lld "
                            "from %llx to %llx", __func__, ldcp->ldc_id,
                            ldcp->peer_session, ver_pkt->tag.vio_sid);
                }

                ldcp->peer_session = ver_pkt->tag.vio_sid;
                ldcp->session_status |= VSW_PEER_SESSION;

                /* Legal message at this time ? */
                if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
                        return;

                /*
                 * First check the device class. Currently only expect
                 * to be talking to a network device. In the future may
                 * also talk to another switch.
                 */
                if (ver_pkt->dev_class != VDEV_NETWORK) {
                        DERR(vswp, "%s: illegal device class %d", __func__,
                            ver_pkt->dev_class);

                        /* NACK: unsupported device class. */
                        ver_pkt->tag.vio_sid = ldcp->local_session;
                        ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

                        DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

                        (void) vsw_send_msg(ldcp, (void *)ver_pkt,
                            sizeof (vio_ver_msg_t), B_TRUE);

                        ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
                        vsw_next_milestone(ldcp);
                        return;
                } else {
                        ldcp->dev_class = ver_pkt->dev_class;
                }

                /*
                 * Now check the version.
                 */
                if (vsw_supported_version(ver_pkt) == 0) {
                        /*
                         * Support this major version and possibly
                         * adjusted minor version.
                         */

                        D2(vswp, "%s: accepted ver %d:%d", __func__,
                            ver_pkt->ver_major, ver_pkt->ver_minor);

                        /* Store accepted values */
                        ldcp->lane_in.ver_major = ver_pkt->ver_major;
                        ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

                        ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

                        ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;

                        if (vsw_obp_ver_proto_workaround == B_TRUE) {
                                /*
                                 * Send a version info message
                                 * using the accepted version that
                                 * we are about to ack. Also note that
                                 * we send our ver info before we ack.
                                 * Otherwise, as soon as receiving the
                                 * ack, obp sends attr info msg, which
                                 * breaks vsw_check_flag() invoked
                                 * from vsw_process_ctrl_attr_pkt();
                                 * as we also need VSW_VER_ACK_RECV to
                                 * be set in lane_out.lstate, before
                                 * we can receive attr info.
                                 */
                                vsw_send_ver(ldcp);
                        }
                } else {
                        /*
                         * NACK back with the next lower major/minor
                         * pairing we support (if we don't support any more
                         * versions then they will be set to zero).
                         */

                        D2(vswp, "%s: replying with ver %d:%d", __func__,
                            ver_pkt->ver_major, ver_pkt->ver_minor);

                        /* Store updated values */
                        ldcp->lane_in.ver_major = ver_pkt->ver_major;
                        ldcp->lane_in.ver_minor = ver_pkt->ver_minor;

                        ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;

                        ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
                }

                /* Send the ACK/NACK reply built above. */
                DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
                ver_pkt->tag.vio_sid = ldcp->local_session;
                (void) vsw_send_msg(ldcp, (void *)ver_pkt,
                    sizeof (vio_ver_msg_t), B_TRUE);

                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_ACK:
                /* Peer accepted our proposed version. */
                D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);

                if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
                        return;

                /* Store updated values */
                ldcp->lane_out.ver_major = ver_pkt->ver_major;
                ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

                ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
                vsw_next_milestone(ldcp);

                break;

        case VIO_SUBTYPE_NACK:
                /* Peer rejected our proposed version. */
                D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);

                if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
                        return;

                /*
                 * If our peer sent us a NACK with the ver fields set to
                 * zero then there is nothing more we can do. Otherwise see
                 * if we support either the version suggested, or a lesser
                 * one.
                 */
                if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
                        DERR(vswp, "%s: peer unable to negotiate any "
                            "further.", __func__);
                        ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
                        vsw_next_milestone(ldcp);
                        return;
                }

                /*
                 * Check to see if we support this major version or
                 * a lower one. If we don't then maj/min will be set
                 * to zero.
                 */
                (void) vsw_supported_version(ver_pkt);
                if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
                        /* Nothing more we can do */
                        DERR(vswp, "%s: version negotiation failed.\n",
                            __func__);
                        ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
                        vsw_next_milestone(ldcp);
                } else {
                        /* found a supported major version */
                        ldcp->lane_out.ver_major = ver_pkt->ver_major;
                        ldcp->lane_out.ver_minor = ver_pkt->ver_minor;

                        D2(vswp, "%s: resending with updated values (%x, %x)",
                            __func__, ver_pkt->ver_major, ver_pkt->ver_minor);

                        /* Retry the handshake with the lower version. */
                        ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
                        ver_pkt->tag.vio_sid = ldcp->local_session;
                        ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;

                        DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);

                        (void) vsw_send_msg(ldcp, (void *)ver_pkt,
                            sizeof (vio_ver_msg_t), B_TRUE);

                        vsw_next_milestone(ldcp);

                }
                break;

        default:
                DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
                    ver_pkt->tag.vio_subtype);
        }

        D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
}

/*
 * Process an attribute info message (VIO_SUBTYPE_INFO) from the peer.
 *
 * Validates the proposed transfer mode, address, ack frequency, dring
 * mode and MTU; negotiates the dring mode and MTU where the protocol
 * version allows; stores the accepted attributes in lane_in (and the
 * negotiated values back into *msg for the reply); and selects the
 * port's transmit routine.
 *
 * Returns 0 if the attributes were accepted, 1 if they were not
 * (caller presumably NACKs the message — confirm against caller).
 */
static int
vsw_process_attr_info(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
{
        vsw_t                   *vswp = ldcp->ldc_vswp;
        vsw_port_t              *port = ldcp->ldc_port;
        struct ether_addr       ea;
        uint64_t                macaddr = 0;
        lane_t                  *lane_out = &ldcp->lane_out;
        lane_t                  *lane_in = &ldcp->lane_in;
        uint32_t                mtu;
        int                     i;
        uint8_t                 dring_mode;

        D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

        /* Legal message at this stage of the handshake ? */
        if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV)) {
                return (1);
        }

        /* Reject transfer modes other than DESC mode or our own mode. */
        if ((msg->xfer_mode != VIO_DESC_MODE) &&
            (msg->xfer_mode != lane_out->xfer_mode)) {
                D2(NULL, "%s: unknown mode %x\n", __func__, msg->xfer_mode);
                return (1);
        }

        /* Only support MAC addresses at moment. */
        if ((msg->addr_type != ADDR_TYPE_MAC) || (msg->addr == 0)) {
                D2(NULL, "%s: invalid addr_type %x, or address 0x%llx\n",
                    __func__, msg->addr_type, msg->addr);
                return (1);
        }

        /*
         * MAC address supplied by device should match that stored
         * in the vsw-port OBP node. Need to decide what to do if they
         * don't match, for the moment just warn but don't fail.
         */
        vnet_macaddr_ultostr(msg->addr, ea.ether_addr_octet);
        if (ether_cmp(&ea, &port->p_macaddr) != 0) {
                DERR(NULL, "%s: device supplied address "
                    "0x%llx doesn't match node address 0x%llx\n",
                    __func__, msg->addr, port->p_macaddr);
        }

        /*
         * Ack freq only makes sense in pkt mode, in shared
         * mode the ring descriptors say whether or not to
         * send back an ACK.
         */
        if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
            (msg->xfer_mode & VIO_DRING_MODE_V1_2)) ||
            (VSW_VER_LT(ldcp, 1, 2) &&
            (msg->xfer_mode == VIO_DRING_MODE_V1_0))) {
                if (msg->ack_freq > 0) {
                        D2(NULL, "%s: non zero ack freq in SHM mode\n",
                            __func__);
                        return (1);
                }
        }

        /*
         * Process dring mode attribute.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 6)) {
                /*
                 * Versions >= 1.6:
                 * Though we are operating in v1.6 mode, it is possible that
                 * RxDringData mode has been disabled either on this guest or
                 * on the peer guest. If so, we revert to pre v1.6 behavior of
                 * TxDring mode. But this must be agreed upon in both
                 * directions of attr exchange. We first determine the mode
                 * that can be negotiated.
                 */
                if ((msg->options & VIO_RX_DRING_DATA) != 0 &&
                    vsw_mapin_avail(ldcp) == B_TRUE) {
                        /*
                         * The peer is capable of handling RxDringData AND we
                         * are also capable of it; we enable RxDringData mode
                         * on this channel.
                         */
                        dring_mode = VIO_RX_DRING_DATA;
                } else if ((msg->options & VIO_TX_DRING) != 0) {
                        /*
                         * If the peer is capable of TxDring mode, we
                         * negotiate TxDring mode on this channel.
                         */
                        dring_mode = VIO_TX_DRING;
                } else {
                        /*
                         * We support only VIO_TX_DRING and VIO_RX_DRING_DATA
                         * modes. We don't support VIO_RX_DRING mode.
                         */
                        return (1);
                }

                /*
                 * If we have received an ack for the attr info that we sent,
                 * then check if the dring mode matches what the peer had ack'd
                 * (saved in lane_out). If they don't match, we fail the
                 * handshake.
                 */
                if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
                        if (msg->options != lane_out->dring_mode) {
                                /* send NACK */
                                return (1);
                        }
                } else {
                        /*
                         * Save the negotiated dring mode in our attr
                         * parameters, so it gets sent in the attr info from us
                         * to the peer.
                         */
                        lane_out->dring_mode = dring_mode;
                }

                /* save the negotiated dring mode in the msg to be replied */
                msg->options = dring_mode;
        }

        /*
         * Process MTU attribute.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 4)) {
                /*
                 * Versions >= 1.4:
                 * Validate mtu of the peer is at least ETHERMAX. Then, the mtu
                 * is negotiated down to the minimum of our mtu and peer's mtu.
                 */
                if (msg->mtu < ETHERMAX) {
                        return (1);
                }

                mtu = MIN(msg->mtu, vswp->max_frame_size);

                /*
                 * If we have received an ack for the attr info
                 * that we sent, then check if the mtu computed
                 * above matches the mtu that the peer had ack'd
                 * (saved in local hparams). If they don't
                 * match, we fail the handshake.
                 */
                if (lane_out->lstate & VSW_ATTR_ACK_RECV) {
                        if (mtu != lane_out->mtu) {
                                /* send NACK */
                                return (1);
                        }
                } else {
                        /*
                         * Save the mtu computed above in our
                         * attr parameters, so it gets sent in
                         * the attr info from us to the peer.
                         */
                        lane_out->mtu = mtu;
                }

                /* save the MIN mtu in the msg to be replied */
                msg->mtu = mtu;
        } else {
                /* Versions < 1.4, mtu must match */
                if (msg->mtu != lane_out->mtu) {
                        D2(NULL, "%s: invalid MTU (0x%llx)\n",
                            __func__, msg->mtu);
                        return (1);
                }
        }

        /*
         * Otherwise store attributes for this lane and update
         * lane state.
         */
        lane_in->mtu = msg->mtu;
        lane_in->addr = msg->addr;
        lane_in->addr_type = msg->addr_type;
        lane_in->xfer_mode = msg->xfer_mode;
        lane_in->ack_freq = msg->ack_freq;
        lane_in->physlink_update = msg->physlink_update;
        lane_in->dring_mode = msg->options;

        /*
         * Check if the client has requested physlink state updates.
         * If there is a physical device bound to this vswitch (L2
         * mode), set the ack bits to indicate it is supported.
         * Otherwise, set the nack bits.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 5)) { /* Protocol ver >= 1.5 */

                /* Does the vnet need phys link state updates ? */
                if ((lane_in->physlink_update &
                    PHYSLINK_UPDATE_STATE_MASK) ==
                    PHYSLINK_UPDATE_STATE) {

                        if (vswp->smode & VSW_LAYER2) {
                                /* is a net-dev assigned to us ? */
                                msg->physlink_update =
                                    PHYSLINK_UPDATE_STATE_ACK;
                                ldcp->pls_negotiated = B_TRUE;
                        } else {
                                /* not in L2 mode */
                                msg->physlink_update =
                                    PHYSLINK_UPDATE_STATE_NACK;
                                ldcp->pls_negotiated = B_FALSE;
                        }

                } else {
                        msg->physlink_update =
                            PHYSLINK_UPDATE_NONE;
                        ldcp->pls_negotiated = B_FALSE;
                }

        } else {
                /*
                 * physlink_update bits are ignored
                 * if set by clients < v1.5 protocol.
                 */
                msg->physlink_update = PHYSLINK_UPDATE_NONE;
                ldcp->pls_negotiated = B_FALSE;
        }

        /* Unpack the 64-bit address, least significant octet last. */
        macaddr = lane_in->addr;
        for (i = ETHERADDRL - 1; i >= 0; i--) {
                port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
                macaddr >>= 8;
        }

        /*
         * Setup device specific xmit routines. Note this could be changed
         * further in vsw_send_dring_info() for versions >= 1.6 if operating in
         * RxDringData mode.
         */
        mutex_enter(&port->tx_lock);

        if ((VSW_VER_GTEQ(ldcp, 1, 2) &&
            (lane_in->xfer_mode & VIO_DRING_MODE_V1_2)) ||
            (VSW_VER_LT(ldcp, 1, 2) &&
            (lane_in->xfer_mode == VIO_DRING_MODE_V1_0))) {
                D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
                port->transmit = vsw_dringsend;
        } else if (lane_in->xfer_mode == VIO_DESC_MODE) {
                D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
                vsw_create_privring(ldcp);
                port->transmit = vsw_descrsend;
                lane_out->xfer_mode = VIO_DESC_MODE;
        }

        /*
         * HybridIO is supported only by vnet, not by OBP.
         * So, set hio_capable to true only when in DRING mode.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 3) &&
            (lane_in->xfer_mode != VIO_DESC_MODE)) {
                (void) atomic_swap_32(&port->p_hio_capable, B_TRUE);
        } else {
                (void) atomic_swap_32(&port->p_hio_capable, B_FALSE);
        }

        mutex_exit(&port->tx_lock);

        return (0);
}

/*
 * Validate an attribute ACK from the peer against the parameters we
 * proposed (dring mode for >= v1.6, mtu for >= v1.4). Returns 0 if the
 * ack is acceptable, 1 if the handshake must fail.
 */
static int
vsw_process_attr_ack(vsw_ldc_t *ldcp, vnet_attr_msg_t *msg)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        lane_t          *lane_out = &ldcp->lane_out;
        lane_t          *lane_in = &ldcp->lane_in;
        boolean_t       ack_sent;

        D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

        if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV)) {
                return (1);
        }

        /*
         * Whether we have already ack'd the peer's attr info determines,
         * for both checks below, if lane_out holds a finalized value that
         * must match exactly, or a capability mask still to be narrowed.
         */
        ack_sent = ((lane_in->lstate & VSW_ATTR_ACK_SENT) != 0);

        /*
         * Dring mode attribute (versions >= 1.6): the peer's ack carries
         * the mode negotiated between our advertised capability and its
         * own.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 6)) {
                if (ack_sent) {
                        /*
                         * The mode negotiated when we ack'd the peer's
                         * attr info (saved in lane_out) must match the
                         * mode the peer now ack's.
                         */
                        if (lane_out->dring_mode != msg->options) {
                                return (1);
                        }
                } else {
                        /* Peer ack'd with a mode we don't support. */
                        if ((msg->options & lane_out->dring_mode) == 0) {
                                return (1);
                        }

                        /* Peer must ack with exactly one mode. */
                        if ((msg->options &
                            (VIO_TX_DRING | VIO_RX_DRING_DATA)) ==
                            (VIO_TX_DRING | VIO_RX_DRING_DATA)) {
                                return (1);
                        }

                        /*
                         * Save the negotiated mode so it can be validated
                         * when the peer's attr info arrives.
                         */
                        lane_out->dring_mode = msg->options;
                }
        }

        /*
         * MTU attribute (versions >= 1.4): the peer's ack carries the
         * minimum of our advertised mtu and the peer's mtu.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 4)) {
                if (ack_sent) {
                        /*
                         * The mtu computed when we ack'd the peer's attr
                         * info must match the mtu the peer now ack's.
                         */
                        if (lane_out->mtu != msg->mtu) {
                                return (1);
                        }
                } else {
                        /*
                         * An mtu larger than the one we proposed is a
                         * protocol violation; otherwise save it so it can
                         * be validated against the peer's attr info.
                         */
                        if (msg->mtu > lane_out->mtu) {
                                return (1);
                        }
                        lane_out->mtu = msg->mtu;
                }
        }

        return (0);
}

/*
 * Process an attribute packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
 * peer has sent us an attribute INFO message
 *
 * If its an ACK we then move to the next stage of the handshake which
 * is to send our descriptor ring info to our peer. If its a NACK then
 * there is nothing more we can (currently) do.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
 * NACK back and reset channel state to INACTIV.
 *
 * FUTURE: in time we will probably negotiate over attributes, but for
 * the moment unacceptable attributes are regarded as a fatal error.
 *
 */
void
vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vnet_attr_msg_t *attr_pkt;
        vsw_t           *vswp = ldcp->ldc_vswp;
        lane_t          *lane_out = &ldcp->lane_out;
        lane_t          *lane_in = &ldcp->lane_in;
        int             rv;

        D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

        /*
         * We know this is a ctrl/attr packet so
         * cast it into the correct structure.
         */
        attr_pkt = (vnet_attr_msg_t *)pkt;

        switch (attr_pkt->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:

                /*
                 * Validate the peer's attributes; on failure free any
                 * inbound lane resources and NACK, otherwise ACK.
                 */
                rv = vsw_process_attr_info(ldcp, attr_pkt);
                if (rv != 0) {
                        vsw_free_lane_resources(ldcp, INBOUND);
                        attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
                        lane_in->lstate |= VSW_ATTR_NACK_SENT;
                } else {
                        attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
                        lane_in->lstate |= VSW_ATTR_ACK_SENT;
                }
                attr_pkt->tag.vio_sid = ldcp->local_session;
                DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
                (void) vsw_send_msg(ldcp, (void *)attr_pkt,
                    sizeof (vnet_attr_msg_t), B_TRUE);
                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_ACK:

                rv = vsw_process_attr_ack(ldcp, attr_pkt);
                if (rv != 0) {
                        return;
                }
                lane_out->lstate |= VSW_ATTR_ACK_RECV;
                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_NACK:
                D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

                if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
                        return;

                lane_out->lstate |= VSW_ATTR_NACK_RECV;
                vsw_next_milestone(ldcp);
                break;

        default:
                DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
                    attr_pkt->tag.vio_subtype);
        }

        D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Validate and map a dring registration (INFO) from the peer.
 * Returns 0 on success, 1 if the message must be NACK'd.
 */
static int
vsw_process_dring_reg_info(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
{
        vio_dring_reg_msg_t     *regp = (vio_dring_reg_msg_t *)tagp;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        lane_t                  *lp = &ldcp->lane_out;
        dring_info_t            *dp;

        D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

        if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV) != 0) {
                return (1);
        }

        /*
         * The registration must carry the negotiated dring mode. The
         * earlier version of Solaris vnet driver doesn't set the option
         * (VIO_TX_DRING in its case) correctly in its dring reg message,
         * so this check is applied only for versions >= v1.6.
         */
        if (VSW_VER_GTEQ(ldcp, 1, 6) && (lp->dring_mode != regp->options)) {
                DWARN(vswp, "%s(%lld): Rcvd dring reg option (%d), "
                    "negotiated mode (%d)\n", __func__, ldcp->ldc_id,
                    regp->options, lp->dring_mode);
                return (1);
        }

        /* Map the dring exported by the peer. */
        dp = vsw_map_dring(ldcp, (void *)tagp);
        if (dp == NULL) {
                return (1);
        }

        /* In RxDringData mode also map the peer's exported data buffers. */
        if (lp->dring_mode == VIO_RX_DRING_DATA &&
            vsw_map_data(ldcp, dp, (void *)tagp) != 0) {
                vsw_unmap_dring(ldcp);
                return (1);
        }

        return (0);
}

/*
 * Process the peer's ACK to our dring registration: record the dring
 * ident the peer assigned. Returns 0 on success, 1 on a state error.
 */
static int
vsw_process_dring_reg_ack(vsw_ldc_t *ldcp, vio_msg_tag_t *tagp)
{
        vsw_t   *vswp = ldcp->ldc_vswp;

        D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

        if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV) != 0) {
                return (1);
        }

        /* Save the dring_ident acked by the peer. */
        ldcp->lane_out.dringp->ident =
            ((vio_dring_reg_msg_t *)tagp)->dring_ident;

        return (0);
}

/*
 * Process a dring info packet. We can end up here either because our peer
 * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
 * peer has sent us a dring INFO message.
 *
 * If we get a valid/acceptable INFO packet (and we have already negotiated
 * a version) we ACK back and update the lane state, otherwise we NACK back.
 *
 * FUTURE: nothing to stop client from sending us info on multiple dring's
 * but for the moment we will just use the first one we are given.
 *
 */
void
vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        int             rv;
        int             msgsize;
        dring_info_t    *dp;
        vio_msg_tag_t   *tagp = (vio_msg_tag_t *)pkt;
        vsw_t           *vswp = ldcp->ldc_vswp;
        lane_t          *lane_out = &ldcp->lane_out;
        lane_t          *lane_in = &ldcp->lane_in;

        D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

        switch (tagp->vio_subtype) {
        case VIO_SUBTYPE_INFO:
                /*
                 * Peer is registering a dring with us; validate and map
                 * it. On failure, free any partially mapped inbound
                 * resources and NACK back.
                 */
                rv = vsw_process_dring_reg_info(ldcp, tagp);
                if (rv != 0) {
                        vsw_free_lane_resources(ldcp, INBOUND);
                        tagp->vio_subtype = VIO_SUBTYPE_NACK;
                        lane_in->lstate |= VSW_DRING_NACK_SENT;
                } else {
                        tagp->vio_subtype = VIO_SUBTYPE_ACK;
                        lane_in->lstate |= VSW_DRING_ACK_SENT;
                }
                tagp->vio_sid = ldcp->local_session;
                DUMP_TAG_PTR(tagp);
                /*
                 * In RxDringData mode the registration message carries
                 * extended data cookies, so the reply must be sized to
                 * include them. Guard against a NULL lane_in->dringp: on
                 * the NACK path above, vsw_free_lane_resources() has
                 * already torn down the inbound dring, so dereferencing
                 * it here would access freed state; fall back to the
                 * base message size in that case.
                 */
                if (lane_out->dring_mode == VIO_RX_DRING_DATA &&
                    (dp = lane_in->dringp) != NULL) {
                        msgsize =
                            VNET_DRING_REG_EXT_MSG_SIZE(dp->data_ncookies);
                } else {
                        msgsize = sizeof (vio_dring_reg_msg_t);
                }
                (void) vsw_send_msg(ldcp, (void *)tagp, msgsize, B_TRUE);
                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_ACK:
                /* Peer accepted our dring registration. */
                rv = vsw_process_dring_reg_ack(ldcp, tagp);
                if (rv != 0) {
                        return;
                }
                lane_out->lstate |= VSW_DRING_ACK_RECV;
                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_NACK:
                D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

                if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
                        return;

                lane_out->lstate |= VSW_DRING_NACK_RECV;
                vsw_next_milestone(ldcp);
                break;

        default:
                DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
                    tagp->vio_subtype);
        }

        D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

/*
 * Process a request from peer to unregister a dring.
 *
 * For the moment we just restart the handshake if our
 * peer endpoint attempts to unregister a dring.
 */
void
vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vio_dring_unreg_msg_t   *unreg_pkt = (vio_dring_unreg_msg_t *)pkt;
        vsw_t                   *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        /*
         * Any dring unregister traffic from the peer (info, ack or nack)
         * currently just causes us to restart the handshake.
         */
        switch (unreg_pkt->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
                DWARN(vswp, "%s: restarting handshake..", __func__);
                break;

        case VIO_SUBTYPE_ACK:
                D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
                DWARN(vswp, "%s: restarting handshake..", __func__);
                break;

        case VIO_SUBTYPE_NACK:
                D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
                DWARN(vswp, "%s: restarting handshake..", __func__);
                break;

        default:
                DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
                    unreg_pkt->tag.vio_subtype);
        }

        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Turn a multicast message around into a NACK and send it back to the
 * peer. Wrapped in do { } while (0) so the macro expands as a single
 * statement and is safe in an unbraced if/else; arguments are
 * parenthesized, but note they are evaluated more than once, so callers
 * must pass side-effect-free expressions.
 */
#define SND_MCST_NACK(ldcp, pkt) \
        do { \
                (pkt)->tag.vio_subtype = VIO_SUBTYPE_NACK; \
                (pkt)->tag.vio_sid = (ldcp)->local_session; \
                (void) vsw_send_msg((ldcp), (void *)(pkt), \
                    sizeof (vnet_mcast_msg_t), B_TRUE); \
        } while (0)

/*
 * Process a multicast request from a vnet.
 *
 * Vnet's specify a multicast address that they are interested in. This
 * address is used as a key into the hash table which forms the multicast
 * forwarding database (mFDB).
 *
 * The table keys are the multicast addresses, while the table entries
 * are pointers to lists of ports which wish to receive packets for the
 * specified multicast address.
 *
 * When a multicast packet is being switched we use the address as a key
 * into the hash table, and then walk the appropriate port list forwarding
 * the pkt to each port in turn.
 *
 * If a vnet is no longer interested in a particular multicast grouping
 * we simply find the correct location in the hash table and then delete
 * the relevant port from the port list.
 *
 * To deal with the case whereby a port is being deleted without first
 * removing itself from the lists in the hash table, we maintain a list
 * of multicast addresses the port has registered an interest in, within
 * the port structure itself. We then simply walk that list of addresses
 * using them as keys into the hash table and remove the port from the
 * appropriate lists.
 */
static void
vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vnet_mcast_msg_t        *mcst_pkt = (vnet_mcast_msg_t *)pkt;
        vsw_port_t              *port = ldcp->ldc_port;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        int                     i;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        switch (mcst_pkt->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

                /*
                 * A multicast request is only valid once the handshake
                 * has completed; if not, reset the handshake.
                 */
                if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
                        return;

                /*
                 * NACK the whole request if any of the addresses is not
                 * a valid multicast address (group bit clear).
                 */
                for (i = 0; i < mcst_pkt->count; i++) {
                        if ((mcst_pkt->mca[i].ether_addr_octet[0] & 0x1) ==
                            0) {
                                DERR(vswp, "%s: invalid multicast address",
                                    __func__);
                                SND_MCST_NACK(ldcp, mcst_pkt);
                                return;
                        }
                }

                /* Add or remove the addresses; NACK back on failure. */
                if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
                        SND_MCST_NACK(ldcp, mcst_pkt);
                        return;
                }

                mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
                mcst_pkt->tag.vio_sid = ldcp->local_session;

                DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);

                (void) vsw_send_msg(ldcp, (void *)mcst_pkt,
                    sizeof (vnet_mcast_msg_t), B_TRUE);
                break;

        case VIO_SUBTYPE_ACK:
                DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

                /*
                 * Unexpected: we never request multicast addresses to be
                 * set on some other device, so beyond the state check
                 * there is nothing to do. This may change in the future
                 * if we have cascading switches.
                 */
                if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
                        return;
                break;

        case VIO_SUBTYPE_NACK:
                DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

                /* Unexpected, for the same reason as the ACK above. */
                if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
                        return;
                break;

        default:
                DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
                    mcst_pkt->tag.vio_subtype);
        }

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

static void
vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vio_rdx_msg_t   *rdx_pkt = (vio_rdx_msg_t *)pkt;
        vsw_t           *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

        switch (rdx_pkt->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

                if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_INFO_RECV))
                        return;

                /* Turn the peer's RDX info around into an ACK. */
                rdx_pkt->tag.vio_sid = ldcp->local_session;
                rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;

                DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);

                ldcp->lane_out.lstate |= VSW_RDX_ACK_SENT;

                (void) vsw_send_msg(ldcp, (void *)rdx_pkt,
                    sizeof (vio_rdx_msg_t), B_TRUE);

                vsw_next_milestone(ldcp);
                break;

        case VIO_SUBTYPE_ACK:
                /*
                 * RDX acks are expected to be handled in-band by the
                 * callback handler; getting here indicates a protocol
                 * problem, so restart the connection.
                 */
                DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
                vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                break;

        case VIO_SUBTYPE_NACK:
                D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

                if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_NACK_RECV))
                        return;

                ldcp->lane_in.lstate |= VSW_RDX_NACK_RECV;
                vsw_next_milestone(ldcp);
                break;

        default:
                DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
                    rdx_pkt->tag.vio_subtype);
        }

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Handle a physical-link state message from the peer. vsw only ever
 * sends physlink updates, so INFO messages are unexpected; ACK/NACK
 * replies require no action beyond debug logging.
 */
static void
vsw_process_physlink_msg(vsw_ldc_t *ldcp, void *pkt)
{
        vnet_physlink_msg_t     *msgp = (vnet_physlink_msg_t *)pkt;
        vsw_t                   *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);

        switch (msgp->tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                /* vsw shouldn't recv physlink info */
                DWARN(vswp, "%s: Unexpected VIO_SUBTYPE_INFO", __func__);
                break;

        case VIO_SUBTYPE_ACK:
                D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
                break;

        case VIO_SUBTYPE_NACK:
                D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
                break;

        default:
                DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
                    msgp->tag.vio_subtype);
        }

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Dispatch an incoming data message to the appropriate receive routine
 * based on its vio_subtype_env: dring data, raw packet data, or in-band
 * descriptor data. Validates the peer session id and rejects data that
 * arrives before the handshake (VSW_MILESTONE4) has completed; either
 * failure restarts the connection.
 *
 * Called with ldc_cblock held; in TxDring mode the lock is temporarily
 * swapped for ldc_rxlock while the message is processed (see below).
 */
static void
vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t *tagp,
    uint32_t msglen)
{
        uint16_t        env = tagp->vio_subtype_env;
        vsw_t           *vswp = ldcp->ldc_vswp;
        lane_t          *lp = &ldcp->lane_out;
        uint8_t         dring_mode = lp->dring_mode;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        /* session id check: a mismatch forces a connection restart */
        if (ldcp->session_status & VSW_PEER_SESSION) {
                if (ldcp->peer_session != tagp->vio_sid) {
                        DERR(vswp, "%s (chan %d): invalid session id (%llx)",
                            __func__, ldcp->ldc_id, tagp->vio_sid);
                        vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                        return;
                }
        }

        /*
         * It is an error for us to be getting data packets
         * before the handshake has completed.
         */
        if (ldcp->hphase != VSW_MILESTONE4) {
                DERR(vswp, "%s: got data packet before handshake complete "
                    "hphase %d (%x: %x)", __func__, ldcp->hphase,
                    ldcp->lane_in.lstate, ldcp->lane_out.lstate);
                DUMP_FLAGS(ldcp->lane_in.lstate);
                DUMP_FLAGS(ldcp->lane_out.lstate);
                vsw_process_conn_evt(ldcp, VSW_CONN_RESTART);
                return;
        }
        if (dring_mode == VIO_TX_DRING) {
                /*
                 * To reduce the locking contention, release the ldc_cblock
                 * here and re-acquire it once we are done receiving packets.
                 * We do this only in TxDring mode to allow further callbacks
                 * to continue while the msg worker thread processes the
                 * messages. In RxDringData mode, we process the messages in
                 * the callback itself and wake up rcv worker thread to
                 * process only data info messages.
                 */
                mutex_exit(&ldcp->ldc_cblock);
                mutex_enter(&ldcp->ldc_rxlock);
        }

        /*
         * Switch on vio_subtype envelope, then let lower routines
         * decide if its an INFO, ACK or NACK packet.
         */
        if (env == VIO_DRING_DATA) {
                ldcp->rx_dringdata(ldcp, dpkt);
        } else if (env == VIO_PKT_DATA) {
                ldcp->rx_pktdata(ldcp, dpkt, msglen);
        } else if (env == VIO_DESC_DATA) {
                vsw_process_data_ibnd_pkt(ldcp, dpkt);
        } else {
                DERR(vswp, "%s: unknown vio_subtype_env (%x)\n",
                    __func__, env);
        }

        /* restore the lock state expected by our caller */
        if (dring_mode == VIO_TX_DRING) {
                mutex_exit(&ldcp->ldc_rxlock);
                mutex_enter(&ldcp->ldc_cblock);
        }

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * dummy pkt data handler function for vnet protocol version 1.0
 */
static void
vsw_process_pkt_data_nop(void *arg1, void *arg2, uint32_t msglen)
{
        /*
         * Intentionally empty: raw pkt data messages are simply dropped
         * for v1.0 peers. The _NOTE keeps lint quiet about the unused
         * arguments.
         */
        _NOTE(ARGUNUSED(arg1, arg2, msglen))
}

/*
 * This function handles raw pkt data messages received over the channel.
 * Currently, only priority-eth-type frames are received through this mechanism.
 * In this case, the frame(data) is present within the message itself which
 * is copied into an mblk before switching it.
 */
static void
vsw_process_pkt_data(void *arg1, void *arg2, uint32_t msglen)
{
        vsw_ldc_t               *ldcp = (vsw_ldc_t *)arg1;
        vio_raw_data_msg_t      *dpkt = (vio_raw_data_msg_t *)arg2;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        vgen_stats_t            *statsp = &ldcp->ldc_stats;
        lane_t                  *lp = &ldcp->lane_out;
        vio_mblk_t              *vmp;
        mblk_t                  *mp;
        uint32_t                size;

        /* frame length must lie between ETHERMIN and the negotiated mtu */
        size = msglen - VIO_PKT_DATA_HDRSIZE;
        if (size < ETHERMIN || size > lp->mtu) {
                (void) atomic_inc_32(&statsp->rx_pri_fail);
                DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
                    ldcp->ldc_id, size);
                return;
        }

        /*
         * Get a receive buffer from the pool; if the pool is exhausted,
         * fall back to a plain allocb().
         */
        vmp = vio_multipool_allocb(&ldcp->vmp, size + VLAN_TAGSZ);
        if (vmp != NULL) {
                mp = vmp->mp;
        } else {
                mp = allocb(size + VLAN_TAGSZ, BPRI_MED);
                if (mp == NULL) {
                        (void) atomic_inc_32(&statsp->rx_pri_fail);
                        DWARN(vswp, "%s(%lld) allocb failure, "
                            "unable to process priority frame\n", __func__,
                            ldcp->ldc_id);
                        return;
                }
        }

        /* skip over the extra space reserved for a vlan tag */
        mp->b_rptr += VLAN_TAGSZ;

        /* copy the frame out of the raw data msg payload into the mblk */
        bcopy(dpkt->data, mp->b_rptr, size);
        mp->b_wptr = mp->b_rptr + size;

        if (vmp != NULL) {
                vmp->state = VIO_MBLK_HAS_DATA;
        }

        /* update stats */
        (void) atomic_inc_64(&statsp->rx_pri_packets);
        (void) atomic_add_64(&statsp->rx_pri_bytes, size);

        /*
         * VLAN_TAGSZ of headroom was reserved above in case a tag must
         * be inserted.
         */
        (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);

        /* switch the frame to its destination */
        vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT, ldcp->ldc_port, NULL);
}

/*
 * Process an in-band descriptor message (most likely from
 * OBP).
 */
static void
vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
{
        vnet_ibnd_desc_t        *ibnd_desc;
        dring_info_t            *dp = NULL;
        vsw_private_desc_t      *priv_addr = NULL;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        mblk_t                  *mp = NULL;
        size_t                  nbytes = 0;
        size_t                  off = 0;
        uint64_t                idx = 0;
        uint32_t                num = 1, len, datalen = 0;
        uint64_t                ncookies = 0;
        int                     i, rv;
        int                     j = 0;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        ibnd_desc = (vnet_ibnd_desc_t *)pkt;

        switch (ibnd_desc->hdr.tag.vio_subtype) {
        case VIO_SUBTYPE_INFO:
                /*
                 * Peer has sent us a frame described by an in-band
                 * descriptor: copy the data in over LDC, ACK the
                 * descriptor back, and switch the frame.
                 */
                D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);

                if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
                        return;

                /*
                 * Data is padded to align on a 8 byte boundary,
                 * nbytes is actual data length, i.e. minus that
                 * padding.
                 */
                datalen = ibnd_desc->nbytes;

                D2(vswp, "%s(%lld): processing inband desc : "
                    ": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);

                ncookies = ibnd_desc->ncookies;

                /*
                 * allocb(9F) returns an aligned data block. We
                 * need to ensure that we ask ldc for an aligned
                 * number of bytes also.
                 */
                nbytes = datalen;
                if (nbytes & 0x7) {
                        off = 8 - (nbytes & 0x7);
                        nbytes += off;
                }

                /*
                 * alloc extra space for VLAN_TAG.
                 * NOTE(review): after b_rptr is advanced by 8 below,
                 * the usable space is datalen bytes, while the copy-in
                 * length nbytes may be up to 7 bytes larger due to the
                 * 8-byte round-up above — presumably absorbed by
                 * allocb(9F) slack; confirm against allocb guarantees.
                 */
                mp = allocb(datalen + 8, BPRI_MED);
                if (mp == NULL) {
                        DERR(vswp, "%s(%lld): allocb failed",
                            __func__, ldcp->ldc_id);
                        ldcp->ldc_stats.rx_allocb_fail++;
                        return;
                }

                /* skip over the extra space for VLAN_TAG */
                mp->b_rptr += 8;

                /* pull the frame across the channel into the mblk */
                rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
                    0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
                    LDC_COPY_IN);

                if (rv != 0) {
                        DERR(vswp, "%s(%d): unable to copy in data from "
                            "%d cookie(s)", __func__, ldcp->ldc_id, ncookies);
                        freemsg(mp);
                        ldcp->ldc_stats.ierrors++;
                        return;
                }

                D2(vswp, "%s(%d): copied in %ld bytes using %d cookies",
                    __func__, ldcp->ldc_id, nbytes, ncookies);

                /* point to the actual end of data */
                mp->b_wptr = mp->b_rptr + datalen;
                ldcp->ldc_stats.ipackets++;
                ldcp->ldc_stats.rbytes += datalen;

                /*
                 * We ACK back every in-band descriptor message we process
                 */
                ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
                ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
                (void) vsw_send_msg(ldcp, (void *)ibnd_desc,
                    sizeof (vnet_ibnd_desc_t), B_TRUE);

                /*
                 * there is extra space alloc'd for VLAN_TAG
                 */
                (void) vsw_vlan_frame_pretag(ldcp->ldc_port, VSW_VNETPORT, mp);

                /* send the packet to be switched */
                vswp->vsw_switch_frame(vswp, mp, VSW_VNETPORT,
                    ldcp->ldc_port, NULL);

                break;

        case VIO_SUBTYPE_ACK:
                /*
                 * Peer has ACK'd one of our in-band descriptors: reclaim
                 * the corresponding private descriptor(s).
                 */
                D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);

                /* Verify the ACK is valid */
                idx = ibnd_desc->hdr.desc_handle;

                if (idx >= vsw_num_descriptors) {
                        cmn_err(CE_WARN, "!vsw%d: corrupted ACK received "
                            "(idx %ld)", vswp->instance, idx);
                        return;
                }

                if ((dp = ldcp->lane_out.dringp) == NULL) {
                        DERR(vswp, "%s: no dring found", __func__);
                        return;
                }

                len = dp->num_descriptors;
                /*
                 * If the descriptor we are being ACK'ed for is not the
                 * one we expected, then pkts were lost somewhere, either
                 * when we tried to send a msg, or a previous ACK msg from
                 * our peer. In either case we now reclaim the descriptors
                 * in the range from the last ACK we received up to the
                 * current ACK.
                 */
                if (idx != dp->last_ack_recv) {
                        DWARN(vswp, "%s: dropped pkts detected, (%ld, %ld)",
                            __func__, dp->last_ack_recv, idx);
                        /* wrap-aware count of descriptors to reclaim */
                        num = idx >= dp->last_ack_recv ?
                            idx - dp->last_ack_recv + 1:
                            (len - dp->last_ack_recv + 1) + idx;
                }

                /*
                 * When we sent the in-band message to our peer we
                 * marked the copy in our private ring as READY. We now
                 * check that the descriptor we are being ACK'ed for is in
                 * fact READY, i.e. it is one we have shared with our peer.
                 *
                 * If its not we flag an error, but still reset the descr
                 * back to FREE.
                 */
                for (i = dp->last_ack_recv; j < num; i = (i + 1) % len, j++) {
                        priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
                        mutex_enter(&priv_addr->dstate_lock);
                        if (priv_addr->dstate != VIO_DESC_READY) {
                                DERR(vswp, "%s: (%ld) desc at index %ld not "
                                    "READY (0x%lx)", __func__,
                                    ldcp->ldc_id, idx, priv_addr->dstate);
                                DERR(vswp, "%s: bound %d: ncookies %ld : "
                                    "datalen %ld", __func__,
                                    priv_addr->bound, priv_addr->ncookies,
                                    priv_addr->datalen);
                        }
                        D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
                            ldcp->ldc_id, idx);
                        /* release resources associated with sent msg */
                        priv_addr->datalen = 0;
                        priv_addr->dstate = VIO_DESC_FREE;
                        mutex_exit(&priv_addr->dstate_lock);
                }
                /* update to next expected value */
                dp->last_ack_recv = (idx + 1) % dp->num_descriptors;

                break;

        case VIO_SUBTYPE_NACK:
                DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);

                /*
                 * We should only get a NACK if our peer doesn't like
                 * something about a message we have sent it. If this
                 * happens we just release the resources associated with
                 * the message. (We are relying on higher layers to decide
                 * whether or not to resend.
                 */

                /* limit check */
                idx = ibnd_desc->hdr.desc_handle;

                if (idx >= vsw_num_descriptors) {
                        DERR(vswp, "%s: corrupted NACK received (idx %lld)",
                            __func__, idx);
                        return;
                }

                if ((dp = ldcp->lane_out.dringp) == NULL) {
                        DERR(vswp, "%s: no dring found", __func__);
                        return;
                }

                priv_addr = (vsw_private_desc_t *)dp->priv_addr;

                /* move to correct location in ring */
                priv_addr += idx;

                /* release resources associated with sent msg */
                mutex_enter(&priv_addr->dstate_lock);
                priv_addr->datalen = 0;
                priv_addr->dstate = VIO_DESC_FREE;
                mutex_exit(&priv_addr->dstate_lock);

                break;

        default:
                DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
                    ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
        }

        D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
}

static void
vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp)
{
        _NOTE(ARGUNUSED(epkt))

        vsw_t           *vswp = ldcp->ldc_vswp;
        uint16_t        env = tagp->vio_subtype_env;

        D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);

        /*
         * No error vio_subtypes have been defined in the protocol yet,
         * so all we can do for now is log the subtype environment of
         * the message and return.
         */
        D2(vswp, "%s: (%x) vio_subtype env", __func__, env);

        D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
}

/* transmit the packet over the given port */
int
vsw_portsend(vsw_port_t *port, mblk_t *mp)
{
        vsw_ldc_t       *ldcp = port->ldcp;
        mblk_t          *mpt;
        int             count;

        /*
         * Strip VLAN tags as appropriate for this port; if nothing
         * survives untagging there is nothing to transmit.
         */
        count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt);
        if (count == 0) {
                return (0);
        }

        /* hand the chain to the channel's transmit routine */
        return (ldcp->tx(ldcp, mp, mpt, count));
}

/*
 * Break up frames into 2 seperate chains: normal and
 * priority, based on the frame type. The number of
 * priority frames is also counted and returned.
 *
 * Params:
 *      vswp:   pointer to the instance of vsw
 *      np:     head of packet chain to be broken
 *      npt:    tail of packet chain to be broken
 *
 * Returns:
 *      np:     head of normal data packets
 *      npt:    tail of normal data packets
 *      hp:     head of high priority packets
 *      hpt:    tail of high priority packets
 */
static uint32_t
vsw_get_pri_packets(vsw_t *vswp, mblk_t **np, mblk_t **npt,
    mblk_t **hp, mblk_t **hpt)
{
        struct ether_header     *ehp;
        mblk_t                  *mp;
        mblk_t                  *next;
        mblk_t                  *prio_head = NULL;      /* high prio head */
        mblk_t                  *prio_tail = NULL;      /* high prio tail */
        mblk_t                  *norm_head = NULL;      /* normal pkts head */
        mblk_t                  *norm_tail = NULL;      /* normal pkts tail */
        uint32_t                num_types = vswp->pri_num_types;
        uint16_t                *types = vswp->pri_types;
        uint32_t                npri = 0;
        boolean_t               is_pri;
        int                     i;

        /* walk the chain, unlinking each mblk and classifying it */
        for (mp = *np; mp != NULL; mp = next) {
                next = mp->b_next;
                mp->b_next = NULL;
                mp->b_prev = NULL;

                /* priority if the ethertype matches a configured type */
                ehp = (struct ether_header *)mp->b_rptr;
                is_pri = B_FALSE;
                for (i = 0; i < num_types; i++) {
                        if (ehp->ether_type == types[i]) {
                                is_pri = B_TRUE;
                                break;
                        }
                }

                if (is_pri) {
                        /* append to the high priority chain */
                        if (prio_head == NULL) {
                                prio_head = prio_tail = mp;
                        } else {
                                prio_tail->b_next = mp;
                                prio_tail = mp;
                        }
                        npri++;
                } else {
                        /* append to the normal data chain */
                        if (norm_head == NULL) {
                                norm_head = norm_tail = mp;
                        } else {
                                norm_tail->b_next = mp;
                                norm_tail = mp;
                        }
                }
        }

        *hp = prio_head;
        *hpt = prio_tail;
        *np = norm_head;
        *npt = norm_tail;

        return (npri);
}

/*
 * Wrapper function to transmit normal and/or priority frames over the channel.
 */
static int
vsw_ldctx_pri(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
        vsw_ldc_t               *ldcp = (vsw_ldc_t *)arg;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        mblk_t                  *pri_head;      /* high prio pkts head */
        mblk_t                  *pri_tail;      /* high prio pkts tail */
        mblk_t                  *norm_head;     /* normal pkts head */
        mblk_t                  *norm_tail;     /* normal pkts tail */
        mblk_t                  *next;
        uint32_t                npri;

        ASSERT(VSW_PRI_ETH_DEFINED(vswp));
        ASSERT(count != 0);

        norm_head = mp;
        norm_tail = mpt;

        /* split any priority frames out of the chain */
        npri = vsw_get_pri_packets(vswp, &norm_head, &norm_tail,
            &pri_head, &pri_tail);

        /* send each priority frame individually over the OOB path */
        while (pri_head != NULL) {
                next = pri_head->b_next;
                pri_head->b_next = NULL;
                vsw_ldcsend_pkt(ldcp, pri_head);
                pri_head = next;
        }

        count -= npri;
        if (count == 0) {
                /* the chain consisted solely of priority frames */
                return (0);
        }

        /* transmit the remaining normal data frames */
        return (vsw_ldctx(ldcp, norm_head, norm_tail, count));
}

/*
 * Wrapper function to transmit normal frames over the channel.
 */
static int
vsw_ldctx(void *arg, mblk_t *mp, mblk_t *mpt, uint32_t count)
{
        vsw_ldc_t       *ldcp = (vsw_ldc_t *)arg;
        mblk_t          *next;

        ASSERT(count != 0);

        if (ldcp->tx_thread == NULL) {
                /* no TX thread: transmit each frame synchronously */
                while (mp != NULL) {
                        next = mp->b_next;
                        mp->b_next = mp->b_prev = NULL;
                        (void) vsw_ldcsend(ldcp, mp, 1);
                        mp = next;
                }
                return (0);
        }

        /*
         * The TX thread is enabled; queue the frames and signal
         * the worker.
         */
        mutex_enter(&ldcp->tx_thr_lock);

        if ((ldcp->tx_cnt + count) >= vsw_max_tx_qcount) {
                /* queue limit reached: drop the new packets */
                ldcp->ldc_stats.tx_qfull += count;
                mutex_exit(&ldcp->tx_thr_lock);
                freemsgchain(mp);
                return (0);
        }

        if (ldcp->tx_mhead == NULL) {
                /* queue was empty; wake the worker */
                ldcp->tx_mhead = mp;
                ldcp->tx_mtail = mpt;
                cv_signal(&ldcp->tx_thr_cv);
        } else {
                /* append to the pending chain */
                ldcp->tx_mtail->b_next = mp;
                ldcp->tx_mtail = mpt;
        }
        ldcp->tx_cnt += count;
        mutex_exit(&ldcp->tx_thr_lock);

        return (0);
}

/*
 * This function transmits the frame in the payload of a raw data
 * (VIO_PKT_DATA) message. Thus, it provides an Out-Of-Band path to
 * send special frames with high priorities, without going through
 * the normal data path which uses descriptor ring mechanism.
 */
static void
vsw_ldcsend_pkt(vsw_ldc_t *ldcp, mblk_t *mp)
{
        vio_raw_data_msg_t      *pkt;
        mblk_t                  *bp;
        mblk_t                  *nmp = NULL;
        vio_mblk_t              *vmp;
        caddr_t                 dst;
        uint32_t                mblksz;
        uint32_t                size;
        uint32_t                nbytes;
        int                     rv;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        vgen_stats_t            *statsp = &ldcp->ldc_stats;

        /* drop unless the channel is up and the outbound lane active */
        if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
            (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == 0)) {
                (void) atomic_inc_32(&statsp->tx_pri_fail);
                DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
                    "packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
                    ldcp->lane_out.lstate);
                goto send_pkt_exit;
        }

        size = msgsize(mp);

        /* frame size bigger than available payload len of raw data msg ? */
        if (size > (size_t)(ldcp->msglen - VIO_PKT_DATA_HDRSIZE)) {
                (void) atomic_inc_32(&statsp->tx_pri_fail);
                DWARN(vswp, "%s(%lld) invalid size(%d)\n", __func__,
                    ldcp->ldc_id, size);
                goto send_pkt_exit;
        }

        /* pad runt frames up to the ethernet minimum */
        if (size < ETHERMIN)
                size = ETHERMIN;

        /* alloc space for a raw data message */
        vmp = vio_allocb(vswp->pri_tx_vmp);
        if (vmp == NULL) {
                (void) atomic_inc_32(&statsp->tx_pri_fail);
                DWARN(vswp, "vio_allocb failed\n");
                goto send_pkt_exit;
        } else {
                nmp = vmp->mp;
        }
        pkt = (vio_raw_data_msg_t *)nmp->b_rptr;

        /* copy frame into the payload of raw data message */
        dst = (caddr_t)pkt->data;
        for (bp = mp; bp != NULL; bp = bp->b_cont) {
                mblksz = MBLKL(bp);
                bcopy(bp->b_rptr, dst, mblksz);
                dst += mblksz;
        }

        vmp->state = VIO_MBLK_HAS_DATA;

        /* setup the raw data msg */
        pkt->tag.vio_msgtype = VIO_TYPE_DATA;
        pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
        pkt->tag.vio_subtype_env = VIO_PKT_DATA;
        pkt->tag.vio_sid = ldcp->local_session;
        nbytes = VIO_PKT_DATA_HDRSIZE + size;

        /* send the msg over ldc */
        rv = vsw_send_msg(ldcp, (void *)pkt, nbytes, B_TRUE);
        if (rv != 0) {
                (void) atomic_inc_32(&statsp->tx_pri_fail);
                DWARN(vswp, "%s(%lld) Error sending priority frame\n", __func__,
                    ldcp->ldc_id);
                goto send_pkt_exit;
        }

        /*
         * Update stats.
         * BUGFIX: the byte count was previously added to tx_pri_packets,
         * which double-counted packets and never recorded any bytes; it
         * belongs in tx_pri_bytes.
         */
        (void) atomic_inc_64(&statsp->tx_pri_packets);
        (void) atomic_add_64(&statsp->tx_pri_bytes, size);

send_pkt_exit:
        if (nmp != NULL)
                freemsg(nmp);
        freemsg(mp);
}

/*
 * Transmit the packet over the given LDC channel.
 *
 * The 'retries' argument indicates how many times a packet
 * is retried before it is dropped. Note, the retry is done
 * only for a resource related failure, for all other failures
 * the packet is dropped immediately.
 */
static int
vsw_ldcsend(vsw_ldc_t *ldcp, mblk_t *mp, uint32_t retries)
{
        int             i;
        int             rc;
        int             status = 0;
        vsw_port_t      *port = ldcp->ldc_port;
        dring_info_t    *dp = NULL;
        lane_t          *lp = &ldcp->lane_out;

        for (i = 0; i < retries; ) {
                /*
                 * Send the message out using the appropriate
                 * transmit function which will free mblock when it
                 * is finished with it.
                 */
                mutex_enter(&port->tx_lock);
                if (port->transmit != NULL) {
                        status = (*port->transmit)(ldcp, mp);
                }
                if (status == LDC_TX_SUCCESS) {
                        mutex_exit(&port->tx_lock);
                        break;
                }
                i++;    /* increment the counter here */

                /* If its the last retry, then update the oerror */
                if ((i == retries) && (status == LDC_TX_NORESOURCES)) {
                        ldcp->ldc_stats.oerrors++;
                }
                mutex_exit(&port->tx_lock);

                if (status != LDC_TX_NORESOURCES) {
                        /*
                         * No retrying required for errors un-related
                         * to resources.
                         */
                        break;
                }
                if (((dp = ldcp->lane_out.dringp) != NULL) &&
                    ((VSW_VER_GTEQ(ldcp, 1, 2) &&
                    (ldcp->lane_out.xfer_mode & VIO_DRING_MODE_V1_2)) ||
                    ((VSW_VER_LT(ldcp, 1, 2) &&
                    (ldcp->lane_out.xfer_mode == VIO_DRING_MODE_V1_0))))) {

                        /*
                         * Need to reclaim in TxDring mode.
                         *
                         * BUGFIX: reset rc on every pass so the delay
                         * decision below never reads an uninitialized
                         * (or stale) value when no reclaim is performed
                         * on this iteration (e.g. RxDringData mode).
                         */
                        rc = 0;
                        if (lp->dring_mode == VIO_TX_DRING) {
                                rc = vsw_reclaim_dring(dp, dp->end_idx);
                        }

                } else {
                        /*
                         * If there is no dring or the xfer_mode is
                         * set to DESC_MODE(ie., OBP), then simply break here.
                         */
                        break;
                }

                /*
                 * Delay only if none were reclaimed
                 * and its not the last retry.
                 */
                if ((rc == 0) && (i < retries)) {
                        delay(drv_usectohz(vsw_ldc_tx_delay));
                }
        }
        freemsg(mp);
        return (status);
}

/*
 * Send an in-band descriptor message over ldc.
 */
static int
vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
{
        vsw_t                   *vswp = ldcp->ldc_vswp;
        vnet_ibnd_desc_t        ibnd_msg;
        vsw_private_desc_t      *priv_desc = NULL;
        dring_info_t            *dp = NULL;
        size_t                  n, size = 0;
        caddr_t                 bufp;
        mblk_t                  *bp;
        int                     idx, i;
        int                     status = LDC_TX_SUCCESS;
        static int              warn_msg = 1;   /* rate-limit "no descriptor" */
        lane_t                  *lp = &ldcp->lane_out;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        ASSERT(mp != NULL);

        /* drop unless the channel is up and the outbound lane active */
        if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
            (ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == 0)) {
                DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
                    __func__, ldcp->ldc_id, ldcp->ldc_status,
                    ldcp->lane_out.lstate);
                ldcp->ldc_stats.oerrors++;
                return (LDC_TX_FAILURE);
        }

        /*
         * The dring here is as an internal buffer,
         * rather than a transfer channel.
         */
        if ((dp = ldcp->lane_out.dringp) == NULL) {
                DERR(vswp, "%s(%lld): no dring for outbound lane",
                    __func__, ldcp->ldc_id);
                DERR(vswp, "%s(%lld) status(%d) state (0x%llx)", __func__,
                    ldcp->ldc_id, ldcp->ldc_status, ldcp->lane_out.lstate);
                ldcp->ldc_stats.oerrors++;
                return (LDC_TX_FAILURE);
        }

        /* frames larger than the negotiated MTU are dropped */
        size = msgsize(mp);
        if (size > (size_t)lp->mtu) {
                DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
                    ldcp->ldc_id, size);
                ldcp->ldc_stats.oerrors++;
                return (LDC_TX_FAILURE);
        }

        /*
         * Find a free descriptor in our buffer ring
         */
        if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
                /* warn only once until a descriptor becomes free again */
                if (warn_msg) {
                        DERR(vswp, "%s(%lld): no descriptor available for ring "
                            "at 0x%llx", __func__, ldcp->ldc_id, dp);
                        warn_msg = 0;
                }

                /* nothing more we can do */
                status = LDC_TX_NORESOURCES;
                goto vsw_descrsend_free_exit;
        } else {
                D2(vswp, "%s(%lld): free private descriptor found at pos "
                    "%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx, priv_desc);
                warn_msg = 1;
        }

        /* copy data into the descriptor */
        bufp = priv_desc->datap;
        for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
                n = MBLKL(bp);
                bcopy(bp->b_rptr, bufp, n);
                bufp += n;
        }

        /* record the frame length, padded up to the ethernet minimum */
        priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;

        /* create and send the in-band descp msg */
        ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
        ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
        ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
        ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;

        /*
         * Copy the mem cookies describing the data from the
         * private region of the descriptor ring into the inband
         * descriptor.
         */
        for (i = 0; i < priv_desc->ncookies; i++) {
                bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
                    sizeof (ldc_mem_cookie_t));
        }

        /* the peer will echo desc_handle back in its ACK/NACK */
        ibnd_msg.hdr.desc_handle = idx;
        ibnd_msg.ncookies = priv_desc->ncookies;
        ibnd_msg.nbytes = size;

        ldcp->ldc_stats.opackets++;
        ldcp->ldc_stats.obytes += size;

        (void) vsw_send_msg(ldcp, (void *)&ibnd_msg,
            sizeof (vnet_ibnd_desc_t), B_TRUE);

vsw_descrsend_free_exit:

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
        return (status);
}

static void
vsw_send_ver(void *arg)
{
        vsw_ldc_t       *ldcp = (vsw_ldc_t *)arg;
        vsw_t           *vswp = ldcp->ldc_vswp;
        lane_t          *lp = &ldcp->lane_out;
        vio_ver_msg_t   ver_msg;

        D1(vswp, "%s enter", __func__);

        /* control message proposing the version we wish to speak */
        ver_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        ver_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        ver_msg.tag.vio_subtype_env = VIO_VER_INFO;
        ver_msg.tag.vio_sid = ldcp->local_session;

        if (vsw_obp_ver_proto_workaround) {
                /* use the major,minor that we've ack'd */
                ver_msg.ver_major = ldcp->lane_in.ver_major;
                ver_msg.ver_minor = ldcp->lane_in.ver_minor;
        } else {
                /* propose the highest version we support */
                ver_msg.ver_major = vsw_versions[0].ver_major;
                ver_msg.ver_minor = vsw_versions[0].ver_minor;
        }
        ver_msg.dev_class = VDEV_NETWORK_SWITCH;

        /* record what we sent on the outbound lane */
        lp->lstate |= VSW_VER_INFO_SENT;
        lp->ver_major = ver_msg.ver_major;
        lp->ver_minor = ver_msg.ver_minor;

        DUMP_TAG(ver_msg.tag);

        (void) vsw_send_msg(ldcp, &ver_msg, sizeof (vio_ver_msg_t), B_TRUE);

        D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
}

static void
vsw_send_attr(vsw_ldc_t *ldcp)
{
        vsw_t                   *vswp = ldcp->ldc_vswp;
        lane_t                  *lp = &ldcp->lane_out;
        vnet_attr_msg_t         attr_msg;

        D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

        /*
         * Subtype is set to INFO by default
         */
        attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
        attr_msg.tag.vio_sid = ldcp->local_session;

        /* payload copied from default settings for lane */
        attr_msg.mtu = lp->mtu;
        attr_msg.addr_type = lp->addr_type;
        attr_msg.xfer_mode = lp->xfer_mode;
        /*
         * BUGFIX: ack_freq was previously copied from lp->xfer_mode,
         * advertising a bogus ack frequency to the peer; it must come
         * from the lane's ack_freq setting.
         */
        attr_msg.ack_freq = lp->ack_freq;
        attr_msg.options = lp->dring_mode;

        /* advertise the switch's current MAC address */
        READ_ENTER(&vswp->if_lockrw);
        attr_msg.addr = vnet_macaddr_strtoul((vswp->if_addr).ether_addr_octet);
        RW_EXIT(&vswp->if_lockrw);

        ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;

        DUMP_TAG(attr_msg.tag);

        (void) vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t), B_TRUE);

        D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}

/*
 * Create and send a dring registration message to the peer, and set up
 * the receive machinery appropriate to the negotiated dring mode.
 */
static void
vsw_send_dring_info(vsw_ldc_t *ldcp)
{
        int             msgsize;
        void            *msg;
        vsw_t           *vswp = ldcp->ldc_vswp;
        vsw_port_t      *port = ldcp->ldc_port;
        lane_t          *lp = &ldcp->lane_out;
        vgen_stats_t    *statsp = &ldcp->ldc_stats;

        D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);

        /* dring mode has been negotiated in attr phase; save in stats */
        statsp->dring_mode = lp->dring_mode;

        if (lp->dring_mode == VIO_RX_DRING_DATA) {
                /*
                 * Change the transmit routine for RxDringData mode.
                 */
                port->transmit = vsw_dringsend_shm;
                /* build an extended dring reg msg with data-area cookies */
                msg = (void *) vsw_create_rx_dring_info(ldcp);
                if (msg == NULL) {
                        return;
                }
                msgsize =
                    VNET_DRING_REG_EXT_MSG_SIZE(lp->dringp->data_ncookies);
                /* dedicated worker to drain received packets */
                ldcp->rcv_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
                    vsw_ldc_rcv_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
                ldcp->rx_dringdata = vsw_process_dringdata_shm;
        } else {
                /* TxDring mode: plain dring reg msg */
                msg = (void *) vsw_create_tx_dring_info(ldcp);
                if (msg == NULL) {
                        return;
                }
                msgsize = sizeof (vio_dring_reg_msg_t);
                /* dedicated worker to process incoming dring messages */
                ldcp->msg_thread = thread_create(NULL, 2 * DEFAULTSTKSZ,
                    vsw_ldc_msg_worker, ldcp, 0, &p0, TS_RUN, maxclsyspri);
                ldcp->rx_dringdata = vsw_process_dringdata;
        }

        /* mark the registration as sent, then send and free the msg */
        lp->lstate |= VSW_DRING_INFO_SENT;
        DUMP_TAG_PTR((vio_msg_tag_t *)msg);
        (void) vsw_send_msg(ldcp, msg, msgsize, B_TRUE);
        kmem_free(msg, msgsize);

        D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
}

static void
vsw_send_rdx(vsw_ldc_t *ldcp)
{
        vsw_t           *vswp = ldcp->ldc_vswp;
        vio_rdx_msg_t   rdx_msg;

        D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);

        /* RDX: tell the peer we are ready to exchange data */
        rdx_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
        rdx_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
        rdx_msg.tag.vio_subtype_env = VIO_RDX;
        rdx_msg.tag.vio_sid = ldcp->local_session;

        /*
         * NOTE(review): the RDX-sent flag is recorded on the inbound
         * lane, not the outbound one — presumably matching how RDX
         * completion is tracked elsewhere in the handshake; confirm
         * against the RDX processing code before changing.
         */
        ldcp->lane_in.lstate |= VSW_RDX_INFO_SENT;

        DUMP_TAG(rdx_msg.tag);

        (void) vsw_send_msg(ldcp, &rdx_msg, sizeof (vio_rdx_msg_t), B_TRUE);

        D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
}

/*
 * Remove the specified address from the list of address maintained
 * in this port node.
 */
mcst_addr_t *
vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
{
        vsw_t           *vswp = NULL;
        vsw_port_t      *port = NULL;
        kmutex_t        *lockp;
        mcst_addr_t     **headp;
        mcst_addr_t     *prev;
        mcst_addr_t     *curr;

        D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
            __func__, devtype, addr);

        /* pick the list head and lock for this device type */
        if (devtype == VSW_VNETPORT) {
                port = (vsw_port_t *)arg;
                lockp = &port->mca_lock;
                headp = &port->mcap;
        } else {
                vswp = (vsw_t *)arg;
                lockp = &vswp->mca_lock;
                headp = &vswp->mcap;
        }

        mutex_enter(lockp);

        /* scan the list, unlinking the entry if found */
        prev = NULL;
        for (curr = *headp; curr != NULL; prev = curr, curr = curr->nextp) {
                if (curr->addr == addr) {
                        D2(NULL, "%s: address found", __func__);
                        if (prev == NULL) {
                                /* entry is the list head */
                                *headp = curr->nextp;
                        } else {
                                prev->nextp = curr->nextp;
                        }
                        break;
                }
        }

        mutex_exit(lockp);

        D1(NULL, "%s: exit", __func__);

        /* NULL if the address was not on the list */
        return (curr);
}

/*
 * Create a ring consisting of just a private portion and link
 * it into the list of rings for the outbound lane.
 *
 * These type of rings are used primarily for temporary data
 * storage (i.e. as data buffers).
 */
void
vsw_create_privring(vsw_ldc_t *ldcp)
{
        vsw_t                   *vswp = ldcp->ldc_vswp;
        dring_info_t            *dp;

        D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);

        dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
        mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
        ldcp->lane_out.dringp = dp;

        /* private section only; nothing is exported to the peer */
        dp->pub_addr = NULL;
        dp->num_descriptors = vsw_num_descriptors;
        dp->priv_addr = kmem_zalloc(
            (sizeof (vsw_private_desc_t) * vsw_num_descriptors), KM_SLEEP);

        if (vsw_setup_tx_dring(ldcp, dp)) {
                DERR(vswp, "%s: setup of ring failed", __func__);
                vsw_destroy_tx_dring(ldcp);
                return;
        }

        /* no descriptors consumed yet; restart request pending */
        dp->end_idx = 0;
        dp->restart_reqd = B_TRUE;

        D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Set the default lane attributes. These are copied into
 * the attr msg we send to our peer. If they are not acceptable
 * then (currently) the handshake ends.
 */
static void
vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
{
        /* start from a clean slate */
        bzero(lp, sizeof (lane_t));

        /* advertise the switch's own MAC address */
        READ_ENTER(&vswp->if_lockrw);
        ether_copy(&(vswp->if_addr), &(lp->addr));
        RW_EXIT(&vswp->if_lockrw);

        lp->addr_type = ADDR_TYPE_MAC;
        lp->mtu = vswp->max_frame_size;
        lp->xfer_mode = VIO_DRING_MODE_V1_0;
        lp->seq_num = VNET_ISS;
        lp->ack_freq = 0;       /* for shared mode */
}

/*
 * Map the descriptor ring exported by the peer.
 */
static dring_info_t *
vsw_map_dring(vsw_ldc_t *ldcp, void *pkt)
{
        if (ldcp->lane_out.dring_mode == VIO_RX_DRING_DATA) {
                /*
                 * RxDringData mode: the dring we import from the
                 * peer serves as our transmit descriptor ring.
                 */
                return (vsw_map_tx_dring(ldcp, pkt));
        }

        /*
         * TxDring mode: the dring we import from the peer serves
         * as our receive descriptor ring.
         */
        return (vsw_map_rx_dring(ldcp, pkt));
}

/*
 * Common dring mapping function used in both TxDring and RxDringData modes.
 */
dring_info_t *
vsw_map_dring_cmn(vsw_ldc_t *ldcp, vio_dring_reg_msg_t *dring_pkt)
{
        int             rv;
        dring_info_t    *dp;
        ldc_mem_info_t  minfo;
        vsw_t           *vswp = ldcp->ldc_vswp;

        /*
         * If the dring params are unacceptable then we NACK back.
         */
        if ((dring_pkt->num_descriptors == 0) ||
            (dring_pkt->descriptor_size == 0) ||
            (dring_pkt->ncookies != 1)) {
                DERR(vswp, "%s (%lld): invalid dring info",
                    __func__, ldcp->ldc_id);
                return (NULL);
        }

        dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);

        /* record the ring geometry advertised by the peer */
        dp->num_descriptors = dring_pkt->num_descriptors;
        dp->descriptor_size = dring_pkt->descriptor_size;
        dp->options = dring_pkt->options;
        dp->dring_ncookies = dring_pkt->ncookies;

        /*
         * Note: should only get one cookie. Enforced in
         * the ldc layer.
         */
        bcopy(&dring_pkt->cookie[0], &dp->dring_cookie[0],
            sizeof (ldc_mem_cookie_t));

        /* map the peer's exported dring into our address space */
        rv = ldc_mem_dring_map(ldcp->ldc_handle, &dp->dring_cookie[0],
            dp->dring_ncookies, dp->num_descriptors, dp->descriptor_size,
            LDC_DIRECT_MAP, &(dp->dring_handle));
        if (rv != 0) {
                goto fail;
        }

        rv = ldc_mem_dring_info(dp->dring_handle, &minfo);
        if (rv != 0) {
                goto fail;
        }
        /* store the address of the ring */
        dp->pub_addr = minfo.vaddr;

        /* cache the dring mtype */
        dp->dring_mtype = minfo.mtype;

        /* no private section as we are importing */
        dp->priv_addr = NULL;

        /*
         * Using simple mono increasing int for ident at the moment.
         */
        dp->ident = ldcp->next_ident;
        ldcp->next_ident++;

        /*
         * Acknowledge it; we send back a unique dring identifier that
         * the sending side will use in future to refer to this
         * descriptor ring.
         */
        dring_pkt->dring_ident = dp->ident;

        return (dp);
fail:
        /* unmap if the map succeeded but a later step failed */
        if (dp->dring_handle != 0) {
                (void) ldc_mem_dring_unmap(dp->dring_handle);
        }
        kmem_free(dp, sizeof (*dp));
        return (NULL);
}

/*
 * Unmap the descriptor ring exported by the peer.
 */
static void
vsw_unmap_dring(vsw_ldc_t *ldcp)
{
        /* the imported ring's role depends on the negotiated mode */
        if (ldcp->lane_out.dring_mode == VIO_RX_DRING_DATA)
                vsw_unmap_tx_dring(ldcp);
        else
                vsw_unmap_rx_dring(ldcp);
}

/*
 * Map the shared memory data buffer area exported by the peer.
 * Used in RxDringData mode only.
 */
static int
vsw_map_data(vsw_ldc_t *ldcp, dring_info_t *dp, void *pkt)
{
        int                     rv;
        vio_dring_reg_ext_msg_t *emsg;
        vio_dring_reg_msg_t     *msg = pkt;
        uint8_t                 *buf = (uint8_t *)msg->cookie;
        vsw_t                   *vswp = ldcp->ldc_vswp;
        ldc_mem_info_t          minfo;

        /* skip over dring cookies */
        ASSERT(msg->ncookies == 1);
        buf += (msg->ncookies * sizeof (ldc_mem_cookie_t));

        /* the extended registration info follows the dring cookies */
        emsg = (vio_dring_reg_ext_msg_t *)buf;
        if (emsg->data_ncookies > VNET_DATA_AREA_COOKIES) {
                return (1);
        }

        /* save # of data area cookies */
        dp->data_ncookies = emsg->data_ncookies;

        /* save data area size */
        dp->data_sz = emsg->data_area_size;

        /* allocate ldc mem handle for data area */
        rv = ldc_mem_alloc_handle(ldcp->ldc_handle, &dp->data_handle);
        if (rv != 0) {
                cmn_err(CE_WARN, "ldc_mem_alloc_handle failed\n");
                DWARN(vswp, "%s (%lld) ldc_mem_alloc_handle() failed: %d\n",
                    __func__, ldcp->ldc_id, rv);
                return (1);
        }

        /*
         * Map the data area.
         * BUGFIX: the mapping must be writable (LDC_MEM_W). In
         * RxDringData mode this imported area is where we place
         * outgoing frames for the peer (see vsw_dringsend_shm), so a
         * read-only (LDC_MEM_R) mapping would fault on every transmit.
         */
        rv = ldc_mem_map(dp->data_handle, emsg->data_cookie,
            emsg->data_ncookies, LDC_DIRECT_MAP, LDC_MEM_W,
            (caddr_t *)&dp->data_addr, NULL);
        if (rv != 0) {
                cmn_err(CE_WARN, "ldc_mem_map failed\n");
                DWARN(vswp, "%s (%lld) ldc_mem_map() failed: %d\n",
                    __func__, ldcp->ldc_id, rv);
                return (1);
        }

        /* get the map info */
        rv = ldc_mem_info(dp->data_handle, &minfo);
        if (rv != 0) {
                cmn_err(CE_WARN, "ldc_mem_info failed\n");
                DWARN(vswp, "%s (%lld) ldc_mem_info() failed: %d\n",
                    __func__, ldcp->ldc_id, rv);
                return (1);
        }

        /* shared-memory transmit requires a direct mapping */
        if (minfo.mtype != LDC_DIRECT_MAP) {
                DWARN(vswp, "%s (%lld) mtype(%d) is not direct map\n",
                    __func__, ldcp->ldc_id, minfo.mtype);
                return (1);
        }

        /*
         * NOTE(review): on the failure returns above dp->data_handle is
         * left allocated/mapped — presumably released by the caller's
         * unmap/teardown path; confirm before relying on it.
         */

        /* allocate memory for data area cookies */
        dp->data_cookie = kmem_zalloc(emsg->data_ncookies *
            sizeof (ldc_mem_cookie_t), KM_SLEEP);

        /* save data area cookies */
        bcopy(emsg->data_cookie, dp->data_cookie,
            emsg->data_ncookies * sizeof (ldc_mem_cookie_t));

        return (0);
}

/*
 * Reset and free all the resources associated with the channel.
 */
static void
vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
{
        lane_t  *lp;

        D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);

        /* select the lane being torn down */
        if (dir == INBOUND) {
                D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
                    " of channel %lld", __func__, ldcp->ldc_id);
                lp = &ldcp->lane_in;
        } else {
                D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
                    " of channel %lld", __func__, ldcp->ldc_id);
                lp = &ldcp->lane_out;
        }

        /* reset handshake state and the sequence number first */
        lp->lstate = VSW_LANE_INACTIV;
        lp->seq_num = VNET_ISS;

        if (dir == INBOUND) {
                /* Unmap the remote dring which is imported from the peer */
                vsw_unmap_dring(ldcp);
        } else {
                /* Destroy the local dring which is exported to the peer */
                vsw_destroy_dring(ldcp);
        }

        D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
}

/*
 * Destroy the descriptor ring.
 */
static void
vsw_destroy_dring(vsw_ldc_t *ldcp)
{
        /*
         * The outbound lane's negotiated dring mode determines which
         * flavor of local ring was exported and hence which teardown
         * routine must be used.
         */
        if (ldcp->lane_out.dring_mode == VIO_RX_DRING_DATA)
                vsw_destroy_rx_dring(ldcp);
        else
                vsw_destroy_tx_dring(ldcp);
}

/*
 * vsw_ldc_tx_worker -- A per LDC worker thread to transmit data.
 * This thread is woken up by the vsw_portsend to transmit
 * packets.
 *
 * Runs until VSW_WTHR_STOP is set in tx_thr_flags (see
 * vsw_stop_tx_thread()).  All queue state (tx_mhead/tx_mtail/tx_cnt)
 * is protected by tx_thr_lock; the lock is dropped while the packets
 * are actually sent so that vsw_portsend can keep queueing.
 */
static void
vsw_ldc_tx_worker(void *arg)
{
        callb_cpr_t     cprinfo;
        vsw_ldc_t *ldcp = (vsw_ldc_t *)arg;
        vsw_t *vswp = ldcp->ldc_vswp;
        mblk_t *mp;
        mblk_t *tmp;

        D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
        /* Register with CPR so suspend/resume can park this thread safely. */
        CALLB_CPR_INIT(&cprinfo, &ldcp->tx_thr_lock, callb_generic_cpr,
            "vnet_tx_thread");
        mutex_enter(&ldcp->tx_thr_lock);
        while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP)) {

                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                /*
                 * Wait until the data is received or a stop
                 * request is received.
                 */
                while (!(ldcp->tx_thr_flags & VSW_WTHR_STOP) &&
                    (ldcp->tx_mhead == NULL)) {
                        cv_wait(&ldcp->tx_thr_cv, &ldcp->tx_thr_lock);
                }
                /* no semicolon: CALLB_CPR_SAFE_END supplies its own */
                CALLB_CPR_SAFE_END(&cprinfo, &ldcp->tx_thr_lock)

                /*
                 * First process the stop request.
                 */
                if (ldcp->tx_thr_flags & VSW_WTHR_STOP) {
                        D2(vswp, "%s(%lld):tx thread stopped\n",
                            __func__, ldcp->ldc_id);
                        break;
                }
                /*
                 * Detach the entire queued chain under the lock, then
                 * drop the lock so new packets can be queued while we
                 * transmit this batch.
                 */
                mp = ldcp->tx_mhead;
                ldcp->tx_mhead = ldcp->tx_mtail = NULL;
                ldcp->tx_cnt = 0;
                mutex_exit(&ldcp->tx_thr_lock);
                D2(vswp, "%s(%lld):calling vsw_ldcsend\n",
                    __func__, ldcp->ldc_id);
                /* Send each packet individually; unlink it from the chain. */
                while (mp != NULL) {
                        tmp = mp->b_next;
                        mp->b_next = mp->b_prev = NULL;
                        (void) vsw_ldcsend(ldcp, mp, vsw_ldc_tx_retries);
                        mp = tmp;
                }
                mutex_enter(&ldcp->tx_thr_lock);
        }

        /*
         * Update the run status and wakeup the thread that
         * has sent the stop request.
         */
        ldcp->tx_thr_flags &= ~VSW_WTHR_STOP;
        ldcp->tx_thread = NULL;
        CALLB_CPR_EXIT(&cprinfo);
        D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
        thread_exit();
}

/* vsw_stop_tx_thread -- Co-ordinate with the tx worker thread to stop it */
static void
vsw_stop_tx_thread(vsw_ldc_t *ldcp)
{
        kt_did_t        tid = 0;
        vsw_t           *vswp = ldcp->ldc_vswp;

        D1(vswp, "%s(%lld):enter\n", __func__, ldcp->ldc_id);
        /*
         * Send a stop request by setting the stop flag and
         * wait until the tx thread stops.
         */
        mutex_enter(&ldcp->tx_thr_lock);
        if (ldcp->tx_thread != NULL) {
                /*
                 * Capture the thread id now; the worker clears
                 * ldcp->tx_thread on its way out, so it cannot be
                 * read safely after the signal.
                 */
                tid = ldcp->tx_thread->t_did;
                ldcp->tx_thr_flags |= VSW_WTHR_STOP;
                cv_signal(&ldcp->tx_thr_cv);
        }
        mutex_exit(&ldcp->tx_thr_lock);

        /* Join outside the lock: the worker needs tx_thr_lock to exit. */
        if (tid != 0) {
                thread_join(tid);
        }

        D1(vswp, "%s(%lld):exit\n", __func__, ldcp->ldc_id);
}

/*
 * Check whether the channel has enough direct-map (mapin) space to
 * accommodate all of the rx dring data buffers at the current max
 * frame size.  Returns B_TRUE if so, B_FALSE otherwise (including
 * when the channel info cannot be obtained).
 */
static int
vsw_mapin_avail(vsw_ldc_t *ldcp)
{
        ldc_info_t      info;
        uint64_t        mapin_needed;
        uint64_t        dblk_sz;
        vsw_t           *vswp = ldcp->ldc_vswp;

        if (ldc_info(ldcp->ldc_handle, &info) != 0) {
                return (B_FALSE);
        }

        /* Space required: one data block per rx buffer. */
        dblk_sz = RXDRING_DBLK_SZ(vswp->max_frame_size);
        mapin_needed = VSW_RXDRING_NRBUFS * dblk_sz;

        return ((info.direct_map_size_max >= mapin_needed) ?
            B_TRUE : B_FALSE);
}

/*
 * Debugging routines
 */
static void
display_state(void)
{
        extern vsw_t    *vsw_head;
        vsw_t           *vswp;
        vsw_port_list_t *plistp;
        vsw_port_t      *portp;
        vsw_ldc_t       *ldcp;

        cmn_err(CE_NOTE, "***** system state *****");

        /* Walk every vsw instance, then every port, dumping its channel. */
        for (vswp = vsw_head; vswp != NULL; vswp = vswp->next) {
                plistp = &vswp->plist;
                READ_ENTER(&plistp->lockrw);
                cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
                    vswp->instance, plistp->num_ports);

                for (portp = plistp->head; portp != NULL;
                    portp = portp->p_next) {
                        cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
                            portp->p_instance, portp->num_ldcs);
                        ldcp = portp->ldcp;
                        cmn_err(CE_CONT, "chan %lu : dev %d : "
                            "status %d : phase %u\n",
                            ldcp->ldc_id, ldcp->dev_class,
                            ldcp->ldc_status, ldcp->hphase);
                        cmn_err(CE_CONT, "chan %lu : lsession %lu : "
                            "psession %lu\n", ldcp->ldc_id,
                            ldcp->local_session, ldcp->peer_session);

                        cmn_err(CE_CONT, "Inbound lane:\n");
                        display_lane(&ldcp->lane_in);
                        cmn_err(CE_CONT, "Outbound lane:\n");
                        display_lane(&ldcp->lane_out);
                }
                RW_EXIT(&plistp->lockrw);
        }
        cmn_err(CE_NOTE, "***** system state *****");
}

/*
 * Debug dump of a single lane: version, state, mtu, address info and
 * the lane's descriptor ring details (if a dring has been allocated).
 */
static void
display_lane(lane_t *lp)
{
        dring_info_t    *drp = lp->dringp;

        cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
            lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
        cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
            lp->addr_type, lp->addr, lp->xfer_mode);
        cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);

        cmn_err(CE_CONT, "Dring info:\n");

        /*
         * A lane that has not completed (or has torn down) its
         * handshake has no dring; bail out rather than dereference
         * a NULL pointer.  display_ring() already guards its own
         * pub_addr/priv_addr accesses the same way.
         */
        if (drp == NULL) {
                cmn_err(CE_CONT, "\tno dring allocated\n");
                return;
        }

        cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
            drp->num_descriptors, drp->descriptor_size);
        cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->dring_handle);
        cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
            (uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
        cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
            drp->ident, drp->end_idx);
        display_ring(drp);
}

/*
 * Debug dump of descriptor-ring occupancy: counts how many public and
 * private descriptors are currently in the FREE state.
 */
static void
display_ring(dring_info_t *dringp)
{
        uint64_t        idx;
        uint64_t        nfree_priv = 0;
        uint64_t        nfree_pub = 0;

        for (idx = 0; idx < vsw_num_descriptors; idx++) {
                /* Public (exported) side, if mapped. */
                if (dringp->pub_addr != NULL) {
                        vnet_public_desc_t      *pubp;

                        pubp = (vnet_public_desc_t *)dringp->pub_addr + idx;
                        if (pubp->hdr.dstate == VIO_DESC_FREE)
                                nfree_pub++;
                }

                /* Private (shadow) side, if allocated. */
                if (dringp->priv_addr != NULL) {
                        vsw_private_desc_t      *privp;

                        privp = (vsw_private_desc_t *)dringp->priv_addr + idx;
                        if (privp->dstate == VIO_DESC_FREE)
                                nfree_priv++;
                }
        }
        cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
            idx, nfree_priv, nfree_pub);
}

/*
 * Debug dump of a lane-state bitmask: prints the name of every
 * handshake/state flag that is set in 'state'.
 */
static void
dump_flags(uint64_t state)
{
        size_t  ix;

        typedef struct flag_name {
                int     flag_val;
                char    *flag_name;
        } flag_name_t;

        flag_name_t     flags[] = {
                { VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT" },
                { VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV" },
                { VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV" },
                { VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT" },
                { VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV" },
                { VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT" },
                { VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT" },
                { VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV" },
                { VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT" },
                { VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV" },
                { VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT" },
                { VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV" },
                { VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT" },
                { VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV" },
                { VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT" },
                { VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV" },
                { VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT" },
                { VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV" },
                { VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT" },
                { VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV" },
                { VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT" },
                { VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV" },
                { VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT" },
                { VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV" },
                { VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT" },
                { VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV" },
                { VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT" },
                { VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV" },
                { VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT" },
                { VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV" },
                { VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE" }
        };

        DERR(NULL, "DUMP_FLAGS: %llx\n", state);
        for (ix = 0; ix < sizeof (flags) / sizeof (flags[0]); ix++) {
                if (state & flags[ix].flag_val)
                        DERR(NULL, "DUMP_FLAGS %s", flags[ix].flag_name);
        }
}