/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

#ifdef DEBUG
#define XNB_DEBUG 1
#endif /* DEBUG */

#include "xnb.h"

#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/mac.h>
#include <sys/mac_impl.h> /* For mac_hw_emul(). */
#include <sys/dlpi.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/types.h>
#include <sys/pattr.h>
#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <xen/sys/xenbus_impl.h>
#include <xen/sys/xendev.h>
#include <sys/balloon_impl.h>
#include <sys/evtchn_impl.h>
#include <sys/gnttab.h>
#include <vm/vm_dep.h>
#include <sys/note.h>
#include <sys/gld.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

/*
 * The terms "transmit" and "receive" are used in alignment with domU:
 * packets originating from the peer domU are "transmitted" to other
 * parts of the system, and packets destined for the peer are
 * "received" from those other parts.
 */

/*
 * Should we allow guests to manipulate multicast group membership?
 */
static boolean_t        xnb_multicast_control = B_TRUE;

static boolean_t        xnb_connect_rings(dev_info_t *);
static void             xnb_disconnect_rings(dev_info_t *);
static void             xnb_oe_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);
static void             xnb_hp_state_change(dev_info_t *, ddi_eventcookie_t,
    void *, void *);

static int      xnb_txbuf_constructor(void *, void *, int);
static void     xnb_txbuf_destructor(void *, void *);
static void     xnb_tx_notify_peer(xnb_t *, boolean_t);
static void     xnb_tx_mark_complete(xnb_t *, RING_IDX, int16_t);

mblk_t          *xnb_to_peer(xnb_t *, mblk_t *);
mblk_t          *xnb_copy_to_peer(xnb_t *, mblk_t *);

static void             setup_gop(xnb_t *, gnttab_copy_t *, uchar_t *,
    size_t, size_t, size_t, grant_ref_t);
static boolean_t        is_foreign(void *);

#define INVALID_GRANT_HANDLE    ((grant_handle_t)-1)
#define INVALID_GRANT_REF       ((grant_ref_t)-1)

static kmutex_t xnb_alloc_page_lock;

/*
 * On a 32 bit PAE system physical and machine addresses are larger
 * than 32 bits.  ddi_btop() on such systems takes an unsigned long
 * argument, and so addresses above 4G are truncated before ddi_btop()
 * gets to see them.  To avoid this, code the shift operation here.
 */
#define xnb_btop(addr)  ((addr) >> PAGESHIFT)

/* DMA attributes for transmit and receive data */
static ddi_dma_attr_t buf_dma_attr = {
        DMA_ATTR_V0,            /* version of this structure */
        0,                      /* lowest usable address */
        0xffffffffffffffffULL,  /* highest usable address */
        0x7fffffff,             /* maximum DMAable byte count */
        MMU_PAGESIZE,           /* alignment in bytes */
        0x7ff,                  /* bitmap of burst sizes */
        1,                      /* minimum transfer */
        0xffffffffU,            /* maximum transfer */
        0xffffffffffffffffULL,  /* maximum segment length */
        1,                      /* maximum number of segments */
        1,                      /* granularity */
        0,                      /* flags (reserved) */
};

/* DMA access attributes for data: NOT to be byte swapped. */
static ddi_device_acc_attr_t data_accattr = {
        DDI_DEVICE_ATTR_V0,
        DDI_NEVERSWAP_ACC,
        DDI_STRICTORDER_ACC
};

/*
 * Statistics.
 */
static const char * const aux_statistics[] = {
        "rx_cksum_deferred",
        "tx_cksum_no_need",
        "rx_rsp_notok",
        "tx_notify_deferred",
        "tx_notify_sent",
        "rx_notify_deferred",
        "rx_notify_sent",
        "tx_too_early",
        "rx_too_early",
        "rx_allocb_failed",
        "tx_allocb_failed",
        "rx_foreign_page",
        "mac_full",
        "spurious_intr",
        "allocation_success",
        "allocation_failure",
        "small_allocation_success",
        "small_allocation_failure",
        "other_allocation_failure",
        "rx_pageboundary_crossed",
        "rx_cpoparea_grown",
        "csum_hardware",
        "csum_software",
        "tx_overflow_page",
        "tx_unexpected_flags",
};

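/*
 * kstat update callback: copy the current values of the auxiliary
 * statistics into the kstat data area.  Only reads are supported.
 */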
static int
xnb_ks_aux_update(kstat_t *ksp, int flag)
{
        xnb_t *xnbp;
        kstat_named_t *knp;

        if (flag != KSTAT_READ)
                return (EACCES);

        xnbp = ksp->ks_private;
        knp = ksp->ks_data;

        /*
         * Assignment order should match that of the names in
         * aux_statistics.
         */
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_cksum_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_cksum_no_need;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_rsp_notok;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_notify_sent;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_deferred;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_notify_sent;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_too_early;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_too_early;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_allocb_failed;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_allocb_failed;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_foreign_page;
        (knp++)->value.ui64 = xnbp->xnb_stat_mac_full;
        (knp++)->value.ui64 = xnbp->xnb_stat_spurious_intr;
        (knp++)->value.ui64 = xnbp->xnb_stat_allocation_success;
        (knp++)->value.ui64 = xnbp->xnb_stat_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_success;
        (knp++)->value.ui64 = xnbp->xnb_stat_small_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_other_allocation_failure;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_pagebndry_crossed;
        (knp++)->value.ui64 = xnbp->xnb_stat_rx_cpoparea_grown;
        (knp++)->value.ui64 = xnbp->xnb_stat_csum_hardware;
        (knp++)->value.ui64 = xnbp->xnb_stat_csum_software;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_overflow_page;
        (knp++)->value.ui64 = xnbp->xnb_stat_tx_unexpected_flags;

        return (0);
}

static boolean_t
xnb_ks_init(xnb_t *xnbp)
{
        int nstat = sizeof (aux_statistics) /
            sizeof (aux_statistics[0]);
        const char * const *cp = aux_statistics;
        kstat_named_t *knp;

        /*
         * Create and initialise kstats.
         */
        xnbp->xnb_kstat_aux = kstat_create(ddi_driver_name(xnbp->xnb_devinfo),
            ddi_get_instance(xnbp->xnb_devinfo), "aux_statistics", "net",
            KSTAT_TYPE_NAMED, nstat, 0);
        if (xnbp->xnb_kstat_aux == NULL)
                return (B_FALSE);

        xnbp->xnb_kstat_aux->ks_private = xnbp;
        xnbp->xnb_kstat_aux->ks_update = xnb_ks_aux_update;

        knp = xnbp->xnb_kstat_aux->ks_data;
        while (nstat > 0) {
                kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);

                knp++;
                cp++;
                nstat--;
        }

        kstat_install(xnbp->xnb_kstat_aux);

        return (B_TRUE);
}

static void
xnb_ks_free(xnb_t *xnbp)
{
        kstat_delete(xnbp->xnb_kstat_aux);
}

/*
 * Calculate and insert the transport checksum for an arbitrary packet.
 */
static mblk_t *
xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
{
        _NOTE(ARGUNUSED(xnbp));

        /*
         * XXPV dme: shouldn't rely on the MAC layer's checksum
         * emulation (formerly mac_fix_cksum(), now mac_hw_emul()),
         * not least because it doesn't cover all of the interesting
         * cases :-(
         */
        mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
        mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
        return (mp);
}

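/*
 * Arrange for the transport checksum of a packet to be calculated,
 * using the hardware offload capabilities described by `capab' where
 * possible and falling back to software otherwise.
 */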
mblk_t *
xnb_process_cksum_flags(xnb_t *xnbp, mblk_t *mp, uint32_t capab)
{
        struct ether_header *ehp;
        uint16_t sap;
        uint32_t offset;
        ipha_t *ipha;

        ASSERT(mp->b_next == NULL);

        /*
         * Check that the packet is contained in a single mblk.  In
         * the "from peer" path this is true today, but may change
         * when scatter gather support is added.  In the "to peer"
         * path we cannot be sure, but in most cases it will be true
         * (in the xnbo case the packet has come from a MAC device
         * which is unlikely to split packets).
         */
        if (mp->b_cont != NULL)
                goto software;

        /*
         * If the MAC has no hardware capability don't do any further
         * checking.
         */
        if (capab == 0)
                goto software;

        ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
        ehp = (struct ether_header *)mp->b_rptr;

        if (ntohs(ehp->ether_type) == VLAN_TPID) {
                struct ether_vlan_header *evhp;

                ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
                evhp = (struct ether_vlan_header *)mp->b_rptr;
                sap = ntohs(evhp->ether_type);
                offset = sizeof (struct ether_vlan_header);
        } else {
                sap = ntohs(ehp->ether_type);
                offset = sizeof (struct ether_header);
        }

        /*
         * We only attempt to do IPv4 packets in hardware.
         */
        if (sap != ETHERTYPE_IP)
                goto software;

        /*
         * We know that this is an IPv4 packet.
         */
        ipha = (ipha_t *)(mp->b_rptr + offset);

        switch (ipha->ipha_protocol) {
        case IPPROTO_TCP:
        case IPPROTO_UDP: {
                uint32_t start, length, stuff, cksum;
                uint16_t *stuffp;

                /*
                 * This is a TCP/IPv4 or UDP/IPv4 packet, for which we
                 * can use full IPv4 and partial checksum offload.
                 */
                if ((capab & (HCKSUM_INET_FULL_V4|HCKSUM_INET_PARTIAL)) == 0)
                        break;

                start = IP_SIMPLE_HDR_LENGTH;
                length = ntohs(ipha->ipha_length);
                if (ipha->ipha_protocol == IPPROTO_TCP) {
                        stuff = start + TCP_CHECKSUM_OFFSET;
                        cksum = IP_TCP_CSUM_COMP;
                } else {
                        stuff = start + UDP_CHECKSUM_OFFSET;
                        cksum = IP_UDP_CSUM_COMP;
                }
                stuffp = (uint16_t *)(mp->b_rptr + offset + stuff);

                if (capab & HCKSUM_INET_FULL_V4) {
                        /*
                         * Some devices require that the checksum
                         * field of the packet is zero for full
                         * offload.
                         */
                        *stuffp = 0;

                        mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);

                        xnbp->xnb_stat_csum_hardware++;

                        return (mp);
                }

                if (capab & HCKSUM_INET_PARTIAL) {
                        if (*stuffp == 0) {
                                ipaddr_t src, dst;

                                /*
                                 * Older Solaris guests don't insert
                                 * the pseudo-header checksum, so we
                                 * calculate it here.
                                 */
                                src = ipha->ipha_src;
                                dst = ipha->ipha_dst;

                                cksum += (dst >> 16) + (dst & 0xFFFF);
                                cksum += (src >> 16) + (src & 0xFFFF);
                                cksum += length - IP_SIMPLE_HDR_LENGTH;

                                cksum = (cksum >> 16) + (cksum & 0xFFFF);
                                cksum = (cksum >> 16) + (cksum & 0xFFFF);

                                ASSERT(cksum <= 0xFFFF);

                                *stuffp = (uint16_t)(cksum ? cksum : ~cksum);
                        }

                        mac_hcksum_set(mp, start, stuff, length, 0,
                            HCK_PARTIALCKSUM);

                        xnbp->xnb_stat_csum_hardware++;

                        return (mp);
                }

                /* NOTREACHED */
                break;
        }

        default:
                /* Use software. */
                break;
        }

software:
        /*
         * We are not able to use any offload so do the whole thing in
         * software.
         */
        xnbp->xnb_stat_csum_software++;

        return (xnb_software_csum(xnbp, mp));
}

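/*
 * Attach processing common to all flavours: allocate and initialise
 * the per-instance state, create the kstats and the transmit buffer
 * cache, register xenstore event handlers and advertise our features
 * to the peer.
 */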
int
xnb_attach(dev_info_t *dip, xnb_flavour_t *flavour, void *flavour_data)
{
        xnb_t *xnbp;
        char *xsname;
        char cachename[32];

        xnbp = kmem_zalloc(sizeof (*xnbp), KM_SLEEP);

        xnbp->xnb_flavour = flavour;
        xnbp->xnb_flavour_data = flavour_data;
        xnbp->xnb_devinfo = dip;
        xnbp->xnb_evtchn = INVALID_EVTCHN;
        xnbp->xnb_irq = B_FALSE;
        xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
        xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
        xnbp->xnb_connected = B_FALSE;
        xnbp->xnb_hotplugged = B_FALSE;
        xnbp->xnb_detachable = B_FALSE;
        xnbp->xnb_peer = xvdi_get_oeid(dip);
        xnbp->xnb_be_status = XNB_STATE_INIT;
        xnbp->xnb_fe_status = XNB_STATE_INIT;

        xnbp->xnb_tx_buf_count = 0;

        xnbp->xnb_rx_hv_copy = B_FALSE;
        xnbp->xnb_multicast_control = B_FALSE;

        xnbp->xnb_rx_va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
        ASSERT(xnbp->xnb_rx_va != NULL);

        if (ddi_get_iblock_cookie(dip, 0, &xnbp->xnb_icookie)
            != DDI_SUCCESS)
                goto failure;

        /* Allocated on demand, when/if we enter xnb_copy_to_peer(). */
        xnbp->xnb_rx_cpop = NULL;
        xnbp->xnb_rx_cpop_count = 0;

        mutex_init(&xnbp->xnb_tx_lock, NULL, MUTEX_DRIVER,
            xnbp->xnb_icookie);
        mutex_init(&xnbp->xnb_rx_lock, NULL, MUTEX_DRIVER,
            xnbp->xnb_icookie);
        mutex_init(&xnbp->xnb_state_lock, NULL, MUTEX_DRIVER,
            xnbp->xnb_icookie);

        /* Set driver private pointer now. */
        ddi_set_driver_private(dip, xnbp);

        (void) sprintf(cachename, "xnb_tx_buf_cache_%d", ddi_get_instance(dip));
        xnbp->xnb_tx_buf_cache = kmem_cache_create(cachename,
            sizeof (xnb_txbuf_t), 0,
            xnb_txbuf_constructor, xnb_txbuf_destructor,
            NULL, xnbp, NULL, 0);
        if (xnbp->xnb_tx_buf_cache == NULL)
                goto failure_0;

        if (!xnb_ks_init(xnbp))
                goto failure_1;

        /*
         * Receive notification of changes in the state of the
         * driver in the guest domain.
         */
        if (xvdi_add_event_handler(dip, XS_OE_STATE, xnb_oe_state_change,
            NULL) != DDI_SUCCESS)
                goto failure_2;

        /*
         * Receive notification of hotplug events.
         */
        if (xvdi_add_event_handler(dip, XS_HP_STATE, xnb_hp_state_change,
            NULL) != DDI_SUCCESS)
                goto failure_2;

        xsname = xvdi_get_xsname(dip);

        if (xenbus_printf(XBT_NULL, xsname,
            "feature-multicast-control", "%d",
            xnb_multicast_control ? 1 : 0) != 0)
                goto failure_3;

        if (xenbus_printf(XBT_NULL, xsname,
            "feature-rx-copy", "%d",  1) != 0)
                goto failure_3;
        /*
         * Linux domUs seem to depend on "feature-rx-flip" being 0
         * in addition to "feature-rx-copy" being 1. It seems strange
         * to use four possible states to describe a binary decision,
         * but we might as well play nice.
         */
        if (xenbus_printf(XBT_NULL, xsname,
            "feature-rx-flip", "%d", 0) != 0)
                goto failure_3;

        (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateInitWait);
        (void) xvdi_post_event(dip, XEN_HP_ADD);

        return (DDI_SUCCESS);

failure_3:
        xvdi_remove_event_handler(dip, NULL);

failure_2:
        xnb_ks_free(xnbp);

failure_1:
        kmem_cache_destroy(xnbp->xnb_tx_buf_cache);

failure_0:
        mutex_destroy(&xnbp->xnb_state_lock);
        mutex_destroy(&xnbp->xnb_rx_lock);
        mutex_destroy(&xnbp->xnb_tx_lock);

failure:
        vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);
        kmem_free(xnbp, sizeof (*xnbp));
        return (DDI_FAILURE);
}

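/*
 * Undo the work of xnb_attach().  We must already be disconnected
 * from the peer and have no transmit buffers outstanding.
 */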
void
xnb_detach(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);

        ASSERT(xnbp != NULL);
        ASSERT(!xnbp->xnb_connected);
        ASSERT(xnbp->xnb_tx_buf_count == 0);

        xnb_disconnect_rings(dip);

        xvdi_remove_event_handler(dip, NULL);

        xnb_ks_free(xnbp);

        kmem_cache_destroy(xnbp->xnb_tx_buf_cache);

        ddi_set_driver_private(dip, NULL);

        mutex_destroy(&xnbp->xnb_state_lock);
        mutex_destroy(&xnbp->xnb_rx_lock);
        mutex_destroy(&xnbp->xnb_tx_lock);

        if (xnbp->xnb_rx_cpop_count > 0)
                kmem_free(xnbp->xnb_rx_cpop, sizeof (xnbp->xnb_rx_cpop[0])
                    * xnbp->xnb_rx_cpop_count);

        ASSERT(xnbp->xnb_rx_va != NULL);
        vmem_free(heap_arena, xnbp->xnb_rx_va, PAGESIZE);

        kmem_free(xnbp, sizeof (*xnbp));
}

/*
 * Allocate a page from the hypervisor to be flipped to the peer.
 *
 * Try to get pages in batches to reduce the overhead of calls into
 * the balloon driver.
 */
static mfn_t
xnb_alloc_page(xnb_t *xnbp)
{
#define WARNING_RATE_LIMIT 100
#define BATCH_SIZE 256
        static mfn_t mfns[BATCH_SIZE];  /* common across all instances */
        static int nth = BATCH_SIZE;
        mfn_t mfn;

        mutex_enter(&xnb_alloc_page_lock);
        if (nth == BATCH_SIZE) {
                if (balloon_alloc_pages(BATCH_SIZE, mfns) != BATCH_SIZE) {
                        xnbp->xnb_stat_allocation_failure++;
                        mutex_exit(&xnb_alloc_page_lock);

                        /*
                         * Try for a single page in low memory situations.
                         */
                        if (balloon_alloc_pages(1, &mfn) != 1) {
                                if ((xnbp->xnb_stat_small_allocation_failure++
                                    % WARNING_RATE_LIMIT) == 0)
                                        cmn_err(CE_WARN, "xnb_alloc_page: "
                                            "Cannot allocate memory to "
                                            "transfer packets to peer.");
                                return (0);
                        } else {
                                xnbp->xnb_stat_small_allocation_success++;
                                return (mfn);
                        }
                }

                nth = 0;
                xnbp->xnb_stat_allocation_success++;
        }

        mfn = mfns[nth++];
        mutex_exit(&xnb_alloc_page_lock);

        ASSERT(mfn != 0);

        return (mfn);
#undef BATCH_SIZE
#undef WARNING_RATE_LIMIT
}

/*
 * Free a page back to the hypervisor.
 *
 * This happens only in the error path, so batching is not worth the
 * complication.
 */
static void
xnb_free_page(xnb_t *xnbp, mfn_t mfn)
{
        _NOTE(ARGUNUSED(xnbp));
        int r;
        pfn_t pfn;

        pfn = xen_assign_pfn(mfn);
        pfnzero(pfn, 0, PAGESIZE);
        xen_release_pfn(pfn);

        if ((r = balloon_free_pages(1, &mfn, NULL, NULL)) != 1) {
                cmn_err(CE_WARN, "free_page: cannot decrease memory "
                    "reservation (%d): page kept but unusable (mfn = 0x%lx).",
                    r, mfn);
        }
}

/*
 * Similar to RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring) but using
 * local variables: `loop' is the local request consumer index and
 * `prod' is the private response producer index, so this yields the
 * smaller of the number of requests posted by the peer that we have
 * not yet consumed and the number of free response slots.  Used in
 * both xnb_to_peer() and xnb_copy_to_peer().
 */
#define XNB_RING_HAS_UNCONSUMED_REQUESTS(_r)            \
        ((((_r)->sring->req_prod - loop) <              \
                (RING_SIZE(_r) - (loop - prod))) ?      \
            ((_r)->sring->req_prod - loop) :            \
            (RING_SIZE(_r) - (loop - prod)))

/*
 * Pass packets to the peer using page flipping.
 */
mblk_t *
xnb_to_peer(xnb_t *xnbp, mblk_t *mp)
{
        mblk_t *free = mp, *prev = NULL;
        size_t len;
        gnttab_transfer_t *gop;
        boolean_t notify;
        RING_IDX loop, prod, end;

        /*
         * For each packet the sequence of operations is:
         *
         * 1. get a new page from the hypervisor.
         * 2. get a request slot from the ring.
         * 3. copy the data into the new page.
         * 4. transfer the page to the peer.
         * 5. update the request slot.
         * 6. kick the peer.
         * 7. free mp.
         *
         * In order to reduce the number of hypercalls, we prepare
         * several packets for the peer and perform a single hypercall
         * to transfer them.
         */

        len = 0;
        mutex_enter(&xnbp->xnb_rx_lock);

        /*
         * If we are not connected to the peer or have not yet
         * finished hotplug it is too early to pass packets to the
         * peer.
         */
        if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
                mutex_exit(&xnbp->xnb_rx_lock);
                DTRACE_PROBE(flip_rx_too_early);
                xnbp->xnb_stat_rx_too_early++;
                return (mp);
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
        gop = xnbp->xnb_rx_top;

        while ((mp != NULL) &&
            XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {

                mfn_t mfn;
                pfn_t pfn;
                netif_rx_request_t *rxreq;
                netif_rx_response_t *rxresp;
                char *valoop;
                mblk_t *ml;
                uint16_t cksum_flags;

                /* 1 */
                if ((mfn = xnb_alloc_page(xnbp)) == 0) {
                        xnbp->xnb_stat_rx_defer++;
                        break;
                }

                /* 2 */
                rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);

#ifdef XNB_DEBUG
                if (!(rxreq->id < NET_RX_RING_SIZE))
                        cmn_err(CE_PANIC, "xnb_to_peer: "
                            "id %d out of range in request 0x%p",
                            rxreq->id, (void *)rxreq);
#endif /* XNB_DEBUG */

                /* Assign a pfn and map the new page at the allocated va. */
                pfn = xen_assign_pfn(mfn);
                hat_devload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
                    pfn, PROT_READ | PROT_WRITE, HAT_LOAD);

                /* 3 */
                len = 0;
                valoop = xnbp->xnb_rx_va;
                for (ml = mp; ml != NULL; ml = ml->b_cont) {
                        size_t chunk = ml->b_wptr - ml->b_rptr;

                        bcopy(ml->b_rptr, valoop, chunk);
                        valoop += chunk;
                        len += chunk;
                }

                ASSERT(len < PAGESIZE);

                /* Release the pfn. */
                hat_unload(kas.a_hat, xnbp->xnb_rx_va, PAGESIZE,
                    HAT_UNLOAD_UNMAP);
                xen_release_pfn(pfn);

                /* 4 */
                gop->mfn = mfn;
                gop->domid = xnbp->xnb_peer;
                gop->ref = rxreq->gref;

                /* 5.1 */
                rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
                rxresp->offset = 0;
                rxresp->flags = 0;

                cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
                if (cksum_flags != 0)
                        xnbp->xnb_stat_rx_cksum_deferred++;
                rxresp->flags |= cksum_flags;

                rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
                rxresp->status = len;

                loop++;
                prod++;
                gop++;
                prev = mp;
                mp = mp->b_next;
        }

        /*
         * Did we actually do anything?
         */
        if (loop == xnbp->xnb_rx_ring.req_cons) {
                mutex_exit(&xnbp->xnb_rx_lock);
                return (mp);
        }

        end = loop;

        /*
         * Unlink the end of the 'done' list from the remainder.
         */
        ASSERT(prev != NULL);
        prev->b_next = NULL;

        if (HYPERVISOR_grant_table_op(GNTTABOP_transfer, xnbp->xnb_rx_top,
            loop - xnbp->xnb_rx_ring.req_cons) != 0) {
                cmn_err(CE_WARN, "xnb_to_peer: transfer operation failed");
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;
        gop = xnbp->xnb_rx_top;

        while (loop < end) {
                int16_t status = NETIF_RSP_OKAY;

                if (gop->status != 0) {
                        status = NETIF_RSP_ERROR;

                        /*
                         * If the status is anything other than
                         * GNTST_bad_page then we don't own the page
                         * any more, so don't try to give it back.
                         */
                        if (gop->status != GNTST_bad_page)
                                gop->mfn = 0;
                } else {
                        /* The page is no longer ours. */
                        gop->mfn = 0;
                }

                if (gop->mfn != 0)
                        /*
                         * Give back the page, as we won't be using
                         * it.
                         */
                        xnb_free_page(xnbp, gop->mfn);
                else
                        /*
                         * We gave away a page, update our accounting
                         * now.
                         */
                        balloon_drv_subtracted(1);

                /* 5.2 */
                if (status != NETIF_RSP_OKAY) {
                        RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
                            status;
                } else {
                        xnbp->xnb_stat_ipackets++;
                        xnbp->xnb_stat_rbytes += len;
                }

                loop++;
                prod++;
                gop++;
        }

        xnbp->xnb_rx_ring.req_cons = loop;
        xnbp->xnb_rx_ring.rsp_prod_pvt = prod;

        /* 6 */
        /* LINTED: constant in conditional context */
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
        if (notify) {
                ec_notify_via_evtchn(xnbp->xnb_evtchn);
                xnbp->xnb_stat_rx_notify_sent++;
        } else {
                xnbp->xnb_stat_rx_notify_deferred++;
        }

        if (mp != NULL)
                xnbp->xnb_stat_rx_defer++;

        mutex_exit(&xnbp->xnb_rx_lock);

        /* Free mblk_t's that we consumed. */
        freemsgchain(free);

        return (mp);
}

/* Helper functions for xnb_copy_to_peer(). */

/*
 * Grow the array of copy operation descriptors.
 */
static boolean_t
grow_cpop_area(xnb_t *xnbp)
{
        size_t count;
        gnttab_copy_t *new;

        ASSERT(MUTEX_HELD(&xnbp->xnb_rx_lock));

        count = xnbp->xnb_rx_cpop_count + CPOP_DEFCNT;

        if ((new = kmem_alloc(sizeof (new[0]) * count, KM_NOSLEEP)) == NULL) {
                xnbp->xnb_stat_other_allocation_failure++;
                return (B_FALSE);
        }

        bcopy(xnbp->xnb_rx_cpop, new,
            sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);

        kmem_free(xnbp->xnb_rx_cpop,
            sizeof (xnbp->xnb_rx_cpop[0]) * xnbp->xnb_rx_cpop_count);

        xnbp->xnb_rx_cpop = new;
        xnbp->xnb_rx_cpop_count = count;

        xnbp->xnb_stat_rx_cpoparea_grown++;

        return (B_TRUE);
}

/*
 * Check whether an address is on a page that's foreign to this domain.
 */
static boolean_t
is_foreign(void *addr)
{
        pfn_t pfn = hat_getpfnum(kas.a_hat, addr);

        return ((pfn & PFN_IS_FOREIGN_MFN) == PFN_IS_FOREIGN_MFN);
}

/*
 * Insert a newly allocated mblk into a chain, replacing the old one.
 */
static mblk_t *
replace_msg(mblk_t *mp, size_t len, mblk_t *mp_prev, mblk_t *ml_prev)
{
        uint32_t        start, stuff, end, value, flags;
        mblk_t          *new_mp;

        new_mp = copyb(mp);
        if (new_mp == NULL) {
                cmn_err(CE_PANIC, "replace_msg: cannot alloc new message "
                    "for %p, len %lu", (void *) mp, len);
        }

        mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
        mac_hcksum_set(new_mp, start, stuff, end, value, flags);

        new_mp->b_next = mp->b_next;
        new_mp->b_prev = mp->b_prev;
        new_mp->b_cont = mp->b_cont;

        /* Make sure we only overwrite pointers to the mblk being replaced. */
        if (mp_prev != NULL && mp_prev->b_next == mp)
                mp_prev->b_next = new_mp;

        if (ml_prev != NULL && ml_prev->b_cont == mp)
                ml_prev->b_cont = new_mp;

        mp->b_next = mp->b_prev = mp->b_cont = NULL;
        freemsg(mp);

        return (new_mp);
}

/*
 * Set all the fields in a gnttab_copy_t.
 */
static void
setup_gop(xnb_t *xnbp, gnttab_copy_t *gp, uchar_t *rptr,
    size_t s_off, size_t d_off, size_t len, grant_ref_t d_ref)
{
        ASSERT(xnbp != NULL && gp != NULL);

        gp->source.offset = s_off;
        gp->source.u.gmfn = pfn_to_mfn(hat_getpfnum(kas.a_hat, (caddr_t)rptr));
        gp->source.domid = DOMID_SELF;

        gp->len = (uint16_t)len;
        gp->flags = GNTCOPY_dest_gref;
        gp->status = 0;

        gp->dest.u.ref = d_ref;
        gp->dest.offset = d_off;
        gp->dest.domid = xnbp->xnb_peer;
}

/*
 * Pass packets to the peer using hypervisor copy operations.
 */
mblk_t *
xnb_copy_to_peer(xnb_t *xnbp, mblk_t *mp)
{
        mblk_t          *free = mp, *mp_prev = NULL, *saved_mp = mp;
        mblk_t          *ml, *ml_prev;
        boolean_t       notify;
        RING_IDX        loop, prod;
        int             i;

        /*
         * If the peer does not pre-post buffers for received packets,
         * use page flipping to pass packets to it.
         */
        if (!xnbp->xnb_rx_hv_copy)
                return (xnb_to_peer(xnbp, mp));

        /*
         * For each packet the sequence of operations is:
         *
         *  1. get a request slot from the ring.
         *  2. set up data for hypercall (see NOTE below)
         *  3. have the hypervisor copy the data
         *  4. update the request slot.
         *  5. kick the peer.
         *
         * NOTE ad 2.
         *  In order to reduce the number of hypercalls, we prepare
         *  several mblks (mp->b_cont != NULL) for the peer and
         *  perform a single hypercall to transfer them.  We also have
         *  to set up a separate copy operation for every page.
         *
         * If we have more than one packet (mp->b_next != NULL), we do
         * this whole dance repeatedly.
         */

        mutex_enter(&xnbp->xnb_rx_lock);

        if (!(xnbp->xnb_connected && xnbp->xnb_hotplugged)) {
                mutex_exit(&xnbp->xnb_rx_lock);
                DTRACE_PROBE(copy_rx_too_early);
                xnbp->xnb_stat_rx_too_early++;
                return (mp);
        }

        loop = xnbp->xnb_rx_ring.req_cons;
        prod = xnbp->xnb_rx_ring.rsp_prod_pvt;

        while ((mp != NULL) &&
            XNB_RING_HAS_UNCONSUMED_REQUESTS(&xnbp->xnb_rx_ring)) {
                netif_rx_request_t      *rxreq;
                size_t                  d_offset, len;
                int                     item_count;
                gnttab_copy_t           *gop_cp;
                netif_rx_response_t     *rxresp;
                uint16_t                cksum_flags;
                int16_t                 status = NETIF_RSP_OKAY;

                /* 1 */
                rxreq = RING_GET_REQUEST(&xnbp->xnb_rx_ring, loop);

#ifdef XNB_DEBUG
                if (!(rxreq->id < NET_RX_RING_SIZE))
                        cmn_err(CE_PANIC, "xnb_copy_to_peer: "
                            "id %d out of range in request 0x%p",
                            rxreq->id, (void *)rxreq);
#endif /* XNB_DEBUG */

                /* 2 */
                d_offset = 0;
                len = 0;
                item_count = 0;

                gop_cp = xnbp->xnb_rx_cpop;

                /*
                 * We walk the b_cont pointers and set up a
                 * gnttab_copy_t for each sub-page chunk in each data
                 * block.
                 */
                /* 2a */
                for (ml = mp, ml_prev = NULL; ml != NULL; ml = ml->b_cont) {
                        size_t  chunk = ml->b_wptr - ml->b_rptr;
                        uchar_t *r_tmp, *rpt_align;
                        size_t  r_offset;

                        /*
                         * The hypervisor will not allow us to
                         * reference a foreign page (e.g. one
                         * belonging to another domain) by mfn in the
                         * copy operation. If the data in this mblk is
                         * on such a page we must copy the data into a
                         * local page before initiating the hypervisor
                         * copy operation.
                         */
                        if (is_foreign(ml->b_rptr) || is_foreign(ml->b_wptr)) {
                                mblk_t *ml_new = replace_msg(ml, chunk,
                                    mp_prev, ml_prev);

                                /* We can still use old ml, but not *ml! */
                                if (free == ml)
                                        free = ml_new;
                                if (mp == ml)
                                        mp = ml_new;
                                ml = ml_new;

                                xnbp->xnb_stat_rx_foreign_page++;
                        }

                        rpt_align = (uchar_t *)ALIGN2PAGE(ml->b_rptr);
                        r_offset = (uint16_t)(ml->b_rptr - rpt_align);
                        r_tmp = ml->b_rptr;

                        if (d_offset + chunk > PAGESIZE)
                                cmn_err(CE_PANIC, "xnb_copy_to_peer: mp %p "
                                    "(svd: %p), ml %p,rpt_alg. %p, d_offset "
                                    "(%lu) + chunk (%lu) > PAGESIZE %d!",
                                    (void *)mp, (void *)saved_mp, (void *)ml,
                                    (void *)rpt_align,
                                    d_offset, chunk, (int)PAGESIZE);

                        while (chunk > 0) {
                                size_t part_len;

                                if (item_count == xnbp->xnb_rx_cpop_count) {
                                        if (!grow_cpop_area(xnbp))
                                                goto failure;
                                        gop_cp = &xnbp->xnb_rx_cpop[item_count];
                                }
                                /*
                                 * If our mblk crosses a page boundary, we need
                                 * to do a separate copy for each page.
                                 */
                                if (r_offset + chunk > PAGESIZE) {
                                        part_len = PAGESIZE - r_offset;

                                        DTRACE_PROBE3(mblk_page_crossed,
                                            (mblk_t *), ml, int, chunk, int,
                                            (int)r_offset);

                                        xnbp->xnb_stat_rx_pagebndry_crossed++;
                                } else {
                                        part_len = chunk;
                                }

                                setup_gop(xnbp, gop_cp, r_tmp, r_offset,
                                    d_offset, part_len, rxreq->gref);

                                chunk -= part_len;

                                len += part_len;
                                d_offset += part_len;
                                r_tmp += part_len;
                                /*
                                 * The 2nd, 3rd ... last copies will always
                                 * start at r_tmp, therefore r_offset is 0.
                                 */
                                r_offset = 0;
                                gop_cp++;
                                item_count++;
                        }
                        ml_prev = ml;

                        DTRACE_PROBE4(mblk_loop_end, (mblk_t *), ml, int,
                            chunk, int, len, int, item_count);
                }
                /* 3 */
                if (HYPERVISOR_grant_table_op(GNTTABOP_copy, xnbp->xnb_rx_cpop,
                    item_count) != 0) {
                        cmn_err(CE_WARN, "xnb_copy_to_peer: copy op. failed");
                        DTRACE_PROBE(HV_granttableopfailed);
                }

                /* 4 */
                rxresp = RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod);
                rxresp->offset = 0;

                rxresp->flags = 0;

                DTRACE_PROBE4(got_RX_rsp, int, (int)rxresp->id, int,
                    (int)rxresp->offset, int, (int)rxresp->flags, int,
                    (int)rxresp->status);

                cksum_flags = xnbp->xnb_flavour->xf_cksum_to_peer(xnbp, mp);
                if (cksum_flags != 0)
                        xnbp->xnb_stat_rx_cksum_deferred++;
                rxresp->flags |= cksum_flags;

                rxresp->id = RING_GET_REQUEST(&xnbp->xnb_rx_ring, prod)->id;
                rxresp->status = len;

                DTRACE_PROBE4(RX_rsp_set, int, (int)rxresp->id, int,
                    (int)rxresp->offset, int, (int)rxresp->flags, int,
                    (int)rxresp->status);

                for (i = 0; i < item_count; i++) {
                        if (xnbp->xnb_rx_cpop[i].status != 0) {
                                DTRACE_PROBE2(cpop_status_nonnull, int,
                                    (int)xnbp->xnb_rx_cpop[i].status,
                                    int, i);
                                status = NETIF_RSP_ERROR;
                        }
                }

                /* 5.2 */
                if (status != NETIF_RSP_OKAY) {
                        RING_GET_RESPONSE(&xnbp->xnb_rx_ring, prod)->status =
                            status;
                        xnbp->xnb_stat_rx_rsp_notok++;
                } else {
                        xnbp->xnb_stat_ipackets++;
                        xnbp->xnb_stat_rbytes += len;
                }

                loop++;
                prod++;
                mp_prev = mp;
                mp = mp->b_next;
        }
failure:
        /*
         * Did we actually do anything?
         */
        if (loop == xnbp->xnb_rx_ring.req_cons) {
                mutex_exit(&xnbp->xnb_rx_lock);
                return (mp);
        }

        /*
         * Unlink the end of the 'done' list from the remainder.
         */
        ASSERT(mp_prev != NULL);
        mp_prev->b_next = NULL;

        xnbp->xnb_rx_ring.req_cons = loop;
        xnbp->xnb_rx_ring.rsp_prod_pvt = prod;

        /* 6 */
        /* LINTED: constant in conditional context */
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_rx_ring, notify);
        if (notify) {
                ec_notify_via_evtchn(xnbp->xnb_evtchn);
                xnbp->xnb_stat_rx_notify_sent++;
        } else {
                xnbp->xnb_stat_rx_notify_deferred++;
        }

        if (mp != NULL)
                xnbp->xnb_stat_rx_defer++;

        mutex_exit(&xnbp->xnb_rx_lock);

        /* Free mblk_t structs we have consumed. */
        freemsgchain(free);

        return (mp);
}


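/*
 * Push pending transmit responses to the peer and, if the ring
 * protocol requires it (or `force' is set), notify the peer via the
 * event channel.
 */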
static void
xnb_tx_notify_peer(xnb_t *xnbp, boolean_t force)
{
        boolean_t notify;

        ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));

        /* LINTED: constant in conditional context */
        RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xnbp->xnb_tx_ring, notify);
        if (notify || force) {
                ec_notify_via_evtchn(xnbp->xnb_evtchn);
                xnbp->xnb_stat_tx_notify_sent++;
        } else {
                xnbp->xnb_stat_tx_notify_deferred++;
        }
}

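/*
 * Queue a response with the given id and status on the transmit
 * ring.
 */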
static void
xnb_tx_mark_complete(xnb_t *xnbp, RING_IDX id, int16_t status)
{
        RING_IDX i;
        netif_tx_response_t *txresp;

        ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));

        i = xnbp->xnb_tx_ring.rsp_prod_pvt;

        txresp = RING_GET_RESPONSE(&xnbp->xnb_tx_ring, i);
        txresp->id = id;
        txresp->status = status;

        xnbp->xnb_tx_ring.rsp_prod_pvt = i + 1;

        /*
         * Note that we don't push the change to the peer here - that
         * is the caller's responsibility.
         */
}

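/*
 * Free function for desballoc()ed transmit mblks: return the
 * underlying buffer to the cache once the rest of the system has
 * finished with it.
 */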
static void
xnb_txbuf_recycle(xnb_txbuf_t *txp)
{
        xnb_t *xnbp = txp->xt_xnbp;

        kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);

        xnbp->xnb_tx_buf_outstanding--;
}

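/*
 * kmem cache constructor: allocate a page of DMA-able memory for the
 * transmit buffer and record its MFN, which xnb_from_peer() uses as
 * the destination of grant copy operations.
 */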
static int
xnb_txbuf_constructor(void *buf, void *arg, int kmflag)
{
        _NOTE(ARGUNUSED(kmflag));
        xnb_txbuf_t *txp = buf;
        xnb_t *xnbp = arg;
        size_t len;
        ddi_dma_cookie_t dma_cookie;
        uint_t ncookies;

        txp->xt_free_rtn.free_func = xnb_txbuf_recycle;
        txp->xt_free_rtn.free_arg = (caddr_t)txp;
        txp->xt_xnbp = xnbp;
        txp->xt_next = NULL;

        if (ddi_dma_alloc_handle(xnbp->xnb_devinfo, &buf_dma_attr,
            0, 0, &txp->xt_dma_handle) != DDI_SUCCESS)
                goto failure;

        if (ddi_dma_mem_alloc(txp->xt_dma_handle, PAGESIZE, &data_accattr,
            DDI_DMA_STREAMING, 0, 0, &txp->xt_buf, &len,
            &txp->xt_acc_handle) != DDI_SUCCESS)
                goto failure_1;

        if (ddi_dma_addr_bind_handle(txp->xt_dma_handle, NULL, txp->xt_buf,
            len, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT, 0,
            &dma_cookie, &ncookies)
            != DDI_DMA_MAPPED)
                goto failure_2;
        ASSERT(ncookies == 1);

        txp->xt_mfn = xnb_btop(dma_cookie.dmac_laddress);
        txp->xt_buflen = dma_cookie.dmac_size;

        DTRACE_PROBE(txbuf_allocated);

        atomic_inc_32(&xnbp->xnb_tx_buf_count);
        xnbp->xnb_tx_buf_outstanding++;

        return (0);

failure_2:
        ddi_dma_mem_free(&txp->xt_acc_handle);

failure_1:
        ddi_dma_free_handle(&txp->xt_dma_handle);

failure:

        return (-1);
}

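/*
 * kmem cache destructor: release the DMA resources held by a
 * transmit buffer.
 */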
static void
xnb_txbuf_destructor(void *buf, void *arg)
{
        xnb_txbuf_t *txp = buf;
        xnb_t *xnbp = arg;

        (void) ddi_dma_unbind_handle(txp->xt_dma_handle);
        ddi_dma_mem_free(&txp->xt_acc_handle);
        ddi_dma_free_handle(&txp->xt_dma_handle);

        atomic_dec_32(&xnbp->xnb_tx_buf_count);
}

/*
 * Take packets from the peer and deliver them onward.
 */
static mblk_t *
xnb_from_peer(xnb_t *xnbp)
{
        RING_IDX start, end, loop;
        gnttab_copy_t *cop;
        xnb_txbuf_t **txpp;
        netif_tx_request_t *txreq;
        boolean_t work_to_do, need_notify = B_FALSE;
        mblk_t *head, *tail;
        int n_data_req, i;

        ASSERT(MUTEX_HELD(&xnbp->xnb_tx_lock));

        head = tail = NULL;
around:

        /* LINTED: constant in conditional context */
        RING_FINAL_CHECK_FOR_REQUESTS(&xnbp->xnb_tx_ring, work_to_do);
        if (!work_to_do) {
finished:
                xnb_tx_notify_peer(xnbp, need_notify);

                return (head);
        }

        start = xnbp->xnb_tx_ring.req_cons;
        end = xnbp->xnb_tx_ring.sring->req_prod;

        if ((end - start) > NET_TX_RING_SIZE) {
                /*
                 * This usually indicates that the frontend driver is
                 * misbehaving, as it's not possible to have more than
                 * NET_TX_RING_SIZE ring elements in play at any one
                 * time.
                 *
                 * We reset the ring pointers to the state declared by
                 * the frontend and try to carry on.
                 */
                cmn_err(CE_WARN, "xnb_from_peer: domain %d tried to give us %u "
                    "items in the ring, resetting and trying to recover.",
                    xnbp->xnb_peer, (end - start));

                /* LINTED: constant in conditional context */
                BACK_RING_ATTACH(&xnbp->xnb_tx_ring,
                    (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);

                goto around;
        }

        loop = start;
        cop = xnbp->xnb_tx_cop;
        txpp = xnbp->xnb_tx_bufp;
        n_data_req = 0;

        while (loop < end) {
                static const uint16_t acceptable_flags =
                    NETTXF_csum_blank |
                    NETTXF_data_validated |
                    NETTXF_extra_info;
                uint16_t unexpected_flags;

                txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);

                unexpected_flags = txreq->flags & ~acceptable_flags;
                if (unexpected_flags != 0) {
                        /*
                         * The peer used flag bits that we do not
                         * recognize.
                         */
                        cmn_err(CE_WARN, "xnb_from_peer: "
                            "unexpected flag bits (0x%x) from peer "
                            "in transmit request",
                            unexpected_flags);
                        xnbp->xnb_stat_tx_unexpected_flags++;

                        /* Mark this entry as failed. */
                        xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
                        need_notify = B_TRUE;

                } else if (txreq->flags & NETTXF_extra_info) {
                        struct netif_extra_info *erp;
                        boolean_t status;

                        loop++; /* Consume another slot in the ring. */
                        ASSERT(loop <= end);

                        erp = (struct netif_extra_info *)
                            RING_GET_REQUEST(&xnbp->xnb_tx_ring, loop);

                        switch (erp->type) {
                        case XEN_NETIF_EXTRA_TYPE_MCAST_ADD:
                                ASSERT(xnbp->xnb_multicast_control);
                                status = xnbp->xnb_flavour->xf_mcast_add(xnbp,
                                    &erp->u.mcast.addr);
                                break;
                        case XEN_NETIF_EXTRA_TYPE_MCAST_DEL:
                                ASSERT(xnbp->xnb_multicast_control);
                                status = xnbp->xnb_flavour->xf_mcast_del(xnbp,
                                    &erp->u.mcast.addr);
                                break;
                        default:
                                status = B_FALSE;
                                cmn_err(CE_WARN, "xnb_from_peer: "
                                    "unknown extra type %d", erp->type);
                                break;
                        }

                        xnb_tx_mark_complete(xnbp, txreq->id,
                            status ? NETIF_RSP_OKAY : NETIF_RSP_ERROR);
                        need_notify = B_TRUE;

                } else if ((txreq->offset > PAGESIZE) ||
                    (txreq->offset + txreq->size > PAGESIZE)) {
                        /*
                         * Peer attempted to refer to data beyond the
                         * end of the granted page.
                         */
                        cmn_err(CE_WARN, "xnb_from_peer: "
                            "attempt to refer beyond the end of granted "
                            "page in txreq (offset %d, size %d).",
                            txreq->offset, txreq->size);
                        xnbp->xnb_stat_tx_overflow_page++;

                        /* Mark this entry as failed. */
                        xnb_tx_mark_complete(xnbp, txreq->id, NETIF_RSP_ERROR);
                        need_notify = B_TRUE;

                } else {
                        xnb_txbuf_t *txp;

                        txp = kmem_cache_alloc(xnbp->xnb_tx_buf_cache,
                            KM_NOSLEEP);
                        if (txp == NULL)
                                break;

                        txp->xt_mblk = desballoc((unsigned char *)txp->xt_buf,
                            txp->xt_buflen, 0, &txp->xt_free_rtn);
                        if (txp->xt_mblk == NULL) {
                                kmem_cache_free(xnbp->xnb_tx_buf_cache, txp);
                                break;
                        }

                        txp->xt_idx = loop;
                        txp->xt_id = txreq->id;

                        cop->source.u.ref = txreq->gref;
                        cop->source.domid = xnbp->xnb_peer;
                        cop->source.offset = txreq->offset;

                        cop->dest.u.gmfn = txp->xt_mfn;
                        cop->dest.domid = DOMID_SELF;
                        cop->dest.offset = 0;

                        cop->len = txreq->size;
                        cop->flags = GNTCOPY_source_gref;
                        cop->status = 0;

                        *txpp = txp;

                        txpp++;
                        cop++;
                        n_data_req++;

                        ASSERT(n_data_req <= NET_TX_RING_SIZE);
                }

                loop++;
        }

        xnbp->xnb_tx_ring.req_cons = loop;

        if (n_data_req == 0)
                goto around;

        if (HYPERVISOR_grant_table_op(GNTTABOP_copy,
            xnbp->xnb_tx_cop, n_data_req) != 0) {

                cmn_err(CE_WARN, "xnb_from_peer: copy operation failed");

                txpp = xnbp->xnb_tx_bufp;
                i = n_data_req;
                while (i > 0) {
                        kmem_cache_free(xnbp->xnb_tx_buf_cache, *txpp);
                        txpp++;
                        i--;
                }

                goto finished;
        }

        txpp = xnbp->xnb_tx_bufp;
        cop = xnbp->xnb_tx_cop;
        i = n_data_req;

        while (i > 0) {
                xnb_txbuf_t *txp = *txpp;

                txreq = RING_GET_REQUEST(&xnbp->xnb_tx_ring, txp->xt_idx);

                if (cop->status != 0) {
#ifdef XNB_DEBUG
                        cmn_err(CE_WARN, "xnb_from_peer: "
                            "txpp 0x%p failed (%d)",
                            (void *)*txpp, cop->status);
#endif /* XNB_DEBUG */
                        xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_ERROR);
                        freemsg(txp->xt_mblk);
                } else {
                        mblk_t *mp;

                        mp = txp->xt_mblk;
                        mp->b_rptr = mp->b_wptr = (unsigned char *)txp->xt_buf;
                        mp->b_wptr += txreq->size;
                        mp->b_next = NULL;

                        /*
                         * If there are checksum flags, process them
                         * appropriately.
                         */
                        if ((txreq->flags &
                            (NETTXF_csum_blank | NETTXF_data_validated))
                            != 0) {
                                mp = xnbp->xnb_flavour->xf_cksum_from_peer(xnbp,
                                    mp, txreq->flags);
                                xnbp->xnb_stat_tx_cksum_no_need++;

                                txp->xt_mblk = mp;
                        }

                        if (head == NULL) {
                                ASSERT(tail == NULL);
                                head = mp;
                        } else {
                                ASSERT(tail != NULL);
                                tail->b_next = mp;
                        }
                        tail = mp;

                        xnbp->xnb_stat_opackets++;
                        xnbp->xnb_stat_obytes += txreq->size;

                        xnb_tx_mark_complete(xnbp, txp->xt_id, NETIF_RSP_OKAY);
                }

                txpp++;
                cop++;
                i--;
        }

        goto around;
        /* NOTREACHED */
}

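/*
 * Interrupt handler: the peer has notified us (via the event
 * channel) that there are requests on the transmit ring.  Take the
 * packets it is offering and pass them to the flavour for delivery.
 */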
static uint_t
xnb_intr(caddr_t arg)
{
        xnb_t *xnbp = (xnb_t *)arg;
        mblk_t *mp;

        xnbp->xnb_stat_intr++;

        mutex_enter(&xnbp->xnb_tx_lock);

        ASSERT(xnbp->xnb_connected);

        mp = xnb_from_peer(xnbp);

        mutex_exit(&xnbp->xnb_tx_lock);

        if (!xnbp->xnb_hotplugged) {
                xnbp->xnb_stat_tx_too_early++;
                goto fail;
        }
        if (mp == NULL) {
                xnbp->xnb_stat_spurious_intr++;
                goto fail;
        }

        xnbp->xnb_flavour->xf_from_peer(xnbp, mp);

        return (DDI_INTR_CLAIMED);

fail:
        freemsgchain(mp);
        return (DDI_INTR_CLAIMED);
}

/*
 * Read our configuration from xenstore.
 */
boolean_t
xnb_read_xs_config(xnb_t *xnbp)
{
        char *xsname;
        char mac[ETHERADDRL * 3];

        xsname = xvdi_get_xsname(xnbp->xnb_devinfo);

        if (xenbus_scanf(XBT_NULL, xsname,
            "mac", "%s", mac) != 0) {
                cmn_err(CE_WARN, "xnb_attach: "
                    "cannot read mac address from %s",
                    xsname);
                return (B_FALSE);
        }

        if (ether_aton(mac, xnbp->xnb_mac_addr) != ETHERADDRL) {
                cmn_err(CE_WARN,
                    "xnb_attach: cannot parse mac address %s",
                    mac);
                return (B_FALSE);
        }

        return (B_TRUE);
}

/*
 * Read the configuration of the peer from xenstore.
 */
boolean_t
xnb_read_oe_config(xnb_t *xnbp)
{
        char *oename;
        int i;

        oename = xvdi_get_oename(xnbp->xnb_devinfo);

        if (xenbus_gather(XBT_NULL, oename,
            "event-channel", "%u", &xnbp->xnb_fe_evtchn,
            "tx-ring-ref", "%lu", &xnbp->xnb_tx_ring_ref,
            "rx-ring-ref", "%lu", &xnbp->xnb_rx_ring_ref,
            NULL) != 0) {
                cmn_err(CE_WARN, "xnb_read_oe_config: "
                    "cannot read other-end details from %s",
                    oename);
                return (B_FALSE);
        }

        /*
         * Check whether our peer requests receive side hypervisor
         * copy.
         */
        if (xenbus_scanf(XBT_NULL, oename,
            "request-rx-copy", "%d", &i) != 0)
                i = 0;
        if (i != 0)
                xnbp->xnb_rx_hv_copy = B_TRUE;

        /*
         * Check whether our peer requests multicast_control.
         */
        if (xenbus_scanf(XBT_NULL, oename,
            "request-multicast-control", "%d", &i) != 0)
                i = 0;
        if (i != 0)
                xnbp->xnb_multicast_control = B_TRUE;

        /*
         * The Linux backend driver here checks to see if the peer has
         * set 'feature-no-csum-offload'. This is used to indicate
         * that the guest cannot handle receiving packets without a
         * valid checksum. We don't check here, because packets passed
         * to the peer _always_ have a valid checksum.
         *
         * There are three cases:
         *
         * - the NIC is dedicated: packets from the wire should always
         *   have a valid checksum. If the hardware validates the
         *   checksum then the relevant bit will be set in the packet
         *   attributes and we will inform the peer. It can choose to
         *   ignore the hardware verification.
         *
         * - the NIC is shared (VNIC) and a packet originates from the
         *   wire: this is the same as the case above - the packets
         *   will have a valid checksum.
         *
         * - the NIC is shared (VNIC) and a packet originates from the
         *   host: the MAC layer ensures that all such packets have a
         *   valid checksum by calculating one if the stack did not.
         */

        return (B_TRUE);
}

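/*
 * Complete the connection to the peer: map its rings and bind its
 * event channel, give the flavour a chance to finish its own setup
 * and then announce the connected state through xenbus.
 */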
void
xnb_start_connect(xnb_t *xnbp)
{
        dev_info_t  *dip = xnbp->xnb_devinfo;

        if (!xnb_connect_rings(dip)) {
                cmn_err(CE_WARN, "xnb_start_connect: "
                    "cannot connect rings");
                goto failed;
        }

        if (!xnbp->xnb_flavour->xf_start_connect(xnbp)) {
                cmn_err(CE_WARN, "xnb_start_connect: "
                    "flavour failed to connect");
                goto failed;
        }

        (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
        return;

failed:
        xnbp->xnb_flavour->xf_peer_disconnected(xnbp);
        xnb_disconnect_rings(dip);
        (void) xvdi_switch_state(dip, XBT_NULL,
            XenbusStateClosed);
        (void) xvdi_post_event(dip, XEN_HP_REMOVE);
}

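/*
 * Map the tx and rx shared ring pages granted by the peer and bind
 * the peer's event channel to an interrupt.
 */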
static boolean_t
xnb_connect_rings(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);
        struct gnttab_map_grant_ref map_op;

        /*
         * Cannot attempt to connect the rings if already connected.
         */
        ASSERT(!xnbp->xnb_connected);

        /*
         * 1. allocate a vaddr for the tx page, one for the rx page.
         * 2. call GNTTABOP_map_grant_ref to map the relevant pages
         *    into the allocated vaddr (one for tx, one for rx).
         * 3. call EVTCHNOP_bind_interdomain to have the event channel
         *    bound to this domain.
         * 4. associate the event channel with an interrupt.
         * 5. enable the interrupt.
         */

        /* 1.tx */
        xnbp->xnb_tx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
            0, 0, 0, 0, VM_SLEEP);
        ASSERT(xnbp->xnb_tx_ring_addr != NULL);

        /* 2.tx */
        map_op.host_addr = (uint64_t)(uintptr_t)xnbp->xnb_tx_ring_addr;
        map_op.flags = GNTMAP_host_map;
        map_op.ref = xnbp->xnb_tx_ring_ref;
        map_op.dom = xnbp->xnb_peer;
        hat_prepare_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr, NULL);
        if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
            map_op.status != 0) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot map tx-ring page.");
                goto fail;
        }
        xnbp->xnb_tx_ring_handle = map_op.handle;

        /* LINTED: constant in conditional context */
        BACK_RING_INIT(&xnbp->xnb_tx_ring,
            (netif_tx_sring_t *)xnbp->xnb_tx_ring_addr, PAGESIZE);

        /* 1.rx */
        xnbp->xnb_rx_ring_addr = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
            0, 0, 0, 0, VM_SLEEP);
        ASSERT(xnbp->xnb_rx_ring_addr != NULL);

        /* 2.rx */
        map_op.host_addr = (uint64_t)(uintptr_t)xnbp->xnb_rx_ring_addr;
        map_op.flags = GNTMAP_host_map;
        map_op.ref = xnbp->xnb_rx_ring_ref;
        map_op.dom = xnbp->xnb_peer;
        hat_prepare_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr, NULL);
        if (xen_map_gref(GNTTABOP_map_grant_ref, &map_op, 1, B_FALSE) != 0 ||
            map_op.status != 0) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot map rx-ring page.");
                goto fail;
        }
        xnbp->xnb_rx_ring_handle = map_op.handle;

        /* LINTED: constant in conditional context */
        BACK_RING_INIT(&xnbp->xnb_rx_ring,
            (netif_rx_sring_t *)xnbp->xnb_rx_ring_addr, PAGESIZE);

        /* 3 */
        if (xvdi_bind_evtchn(dip, xnbp->xnb_fe_evtchn) != DDI_SUCCESS) {
                cmn_err(CE_WARN, "xnb_connect_rings: "
                    "cannot bind event channel %d", xnbp->xnb_evtchn);
                xnbp->xnb_evtchn = INVALID_EVTCHN;
                goto fail;
        }
        xnbp->xnb_evtchn = xvdi_get_evtchn(dip);

        /*
         * It would be good to set the state to XenbusStateConnected
         * here as well, but then what if ddi_add_intr() failed?
         * Changing the state in the store will be noticed by the peer
         * and cannot be "taken back".
         */
        mutex_enter(&xnbp->xnb_tx_lock);
        mutex_enter(&xnbp->xnb_rx_lock);

        xnbp->xnb_connected = B_TRUE;

        mutex_exit(&xnbp->xnb_rx_lock);
        mutex_exit(&xnbp->xnb_tx_lock);

        /* 4, 5 */
        if (ddi_add_intr(dip, 0, NULL, NULL, xnb_intr, (caddr_t)xnbp)
            != DDI_SUCCESS) {
                cmn_err(CE_WARN, "xnb_connect_rings: cannot add interrupt");
                goto fail;
        }
        xnbp->xnb_irq = B_TRUE;

        return (B_TRUE);

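        /*
         * No piecemeal cleanup here; the caller responds to our
         * B_FALSE return by calling xnb_disconnect_rings(), which
         * copes with a partially-completed connect.
         */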
fail:
        mutex_enter(&xnbp->xnb_tx_lock);
        mutex_enter(&xnbp->xnb_rx_lock);

        xnbp->xnb_connected = B_FALSE;

        mutex_exit(&xnbp->xnb_rx_lock);
        mutex_exit(&xnbp->xnb_tx_lock);

        return (B_FALSE);
}

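/*
 * Undo the work of xnb_connect_rings(): remove the interrupt, free
 * the event channel, unmap the ring pages and release their virtual
 * addresses.  Each step is guarded, so a partially-connected device
 * is torn down safely.
 */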
static void
xnb_disconnect_rings(dev_info_t *dip)
{
        xnb_t *xnbp = ddi_get_driver_private(dip);

        if (xnbp->xnb_irq) {
                ddi_remove_intr(dip, 0, NULL);
                xnbp->xnb_irq = B_FALSE;
        }

        if (xnbp->xnb_evtchn != INVALID_EVTCHN) {
                xvdi_free_evtchn(dip);
                xnbp->xnb_evtchn = INVALID_EVTCHN;
        }

        if (xnbp->xnb_rx_ring_handle != INVALID_GRANT_HANDLE) {
                struct gnttab_unmap_grant_ref unmap_op;

                unmap_op.host_addr = (uint64_t)(uintptr_t)
                    xnbp->xnb_rx_ring_addr;
                unmap_op.dev_bus_addr = 0;
                unmap_op.handle = xnbp->xnb_rx_ring_handle;
                if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                    &unmap_op, 1) != 0)
                        cmn_err(CE_WARN, "xnb_disconnect_rings: "
                            "cannot unmap rx-ring page (%d)",
                            unmap_op.status);

                xnbp->xnb_rx_ring_handle = INVALID_GRANT_HANDLE;
        }

        if (xnbp->xnb_rx_ring_addr != NULL) {
                hat_release_mapping(kas.a_hat, xnbp->xnb_rx_ring_addr);
                vmem_free(heap_arena, xnbp->xnb_rx_ring_addr, PAGESIZE);
                xnbp->xnb_rx_ring_addr = NULL;
        }

        if (xnbp->xnb_tx_ring_handle != INVALID_GRANT_HANDLE) {
                struct gnttab_unmap_grant_ref unmap_op;

                unmap_op.host_addr = (uint64_t)(uintptr_t)
                    xnbp->xnb_tx_ring_addr;
                unmap_op.dev_bus_addr = 0;
                unmap_op.handle = xnbp->xnb_tx_ring_handle;
                if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                    &unmap_op, 1) != 0)
                        cmn_err(CE_WARN, "xnb_disconnect_rings: "
                            "cannot unmap tx-ring page (%d)",
                            unmap_op.status);

                xnbp->xnb_tx_ring_handle = INVALID_GRANT_HANDLE;
        }

        if (xnbp->xnb_tx_ring_addr != NULL) {
                hat_release_mapping(kas.a_hat, xnbp->xnb_tx_ring_addr);
                vmem_free(heap_arena, xnbp->xnb_tx_ring_addr, PAGESIZE);
                xnbp->xnb_tx_ring_addr = NULL;
        }
}

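/*
 * Callback for changes in the xenbus state of the other end (the
 * peer domain's frontend).
 */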
static void
xnb_oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
        _NOTE(ARGUNUSED(id, arg));
        xnb_t *xnbp = ddi_get_driver_private(dip);
        XenbusState new_state = *(XenbusState *)impl_data;

        ASSERT(xnbp != NULL);

        switch (new_state) {
        case XenbusStateConnected:
                /* spurious state change */
                if (xnbp->xnb_connected)
                        return;

                if (!xnb_read_oe_config(xnbp) ||
                    !xnbp->xnb_flavour->xf_peer_connected(xnbp)) {
                        cmn_err(CE_WARN, "xnb_oe_state_change: "
                            "read otherend config error");
                        (void) xvdi_switch_state(dip, XBT_NULL,
                            XenbusStateClosed);
                        (void) xvdi_post_event(dip, XEN_HP_REMOVE);

                        break;
                }

                mutex_enter(&xnbp->xnb_state_lock);
                xnbp->xnb_fe_status = XNB_STATE_READY;
                if (xnbp->xnb_be_status == XNB_STATE_READY)
                        xnb_start_connect(xnbp);
                mutex_exit(&xnbp->xnb_state_lock);

                /*
                 * Now that we've attempted to connect, it's reasonable
                 * to allow an attempt to detach.
                 */
                xnbp->xnb_detachable = B_TRUE;

                break;

        case XenbusStateClosing:
                (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosing);

                break;

        case XenbusStateClosed:
                xnbp->xnb_flavour->xf_peer_disconnected(xnbp);

                mutex_enter(&xnbp->xnb_tx_lock);
                mutex_enter(&xnbp->xnb_rx_lock);

                xnb_disconnect_rings(dip);
                xnbp->xnb_connected = B_FALSE;

                mutex_exit(&xnbp->xnb_rx_lock);
                mutex_exit(&xnbp->xnb_tx_lock);

                (void) xvdi_switch_state(dip, XBT_NULL, XenbusStateClosed);
                (void) xvdi_post_event(dip, XEN_HP_REMOVE);
                /*
                 * In all likelihood this is already set (in the above
                 * case), but if the peer never attempted to connect
                 * and the domain is destroyed we get here without
                 * having been through the case above, so we set it to
                 * be sure.
                 */
                xnbp->xnb_detachable = B_TRUE;

                break;

        default:
                break;
        }
}

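/*
 * Callback for hotplug state changes.  Once the hotplug script has
 * completed we read our configuration, mark the backend ready and,
 * if the frontend is also ready, start the connection.
 */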
static void
xnb_hp_state_change(dev_info_t *dip, ddi_eventcookie_t id,
    void *arg, void *impl_data)
{
        _NOTE(ARGUNUSED(id, arg));
        xnb_t *xnbp = ddi_get_driver_private(dip);
        xendev_hotplug_state_t state = *(xendev_hotplug_state_t *)impl_data;

        ASSERT(xnbp != NULL);

        switch (state) {
        case Connected:
                /* spurious hotplug event */
                if (xnbp->xnb_hotplugged)
                        break;

                if (!xnb_read_xs_config(xnbp))
                        break;

                if (!xnbp->xnb_flavour->xf_hotplug_connected(xnbp))
                        break;

                mutex_enter(&xnbp->xnb_tx_lock);
                mutex_enter(&xnbp->xnb_rx_lock);

                xnbp->xnb_hotplugged = B_TRUE;

                mutex_exit(&xnbp->xnb_rx_lock);
                mutex_exit(&xnbp->xnb_tx_lock);

                mutex_enter(&xnbp->xnb_state_lock);
                xnbp->xnb_be_status = XNB_STATE_READY;
                if (xnbp->xnb_fe_status == XNB_STATE_READY)
                        xnb_start_connect(xnbp);
                mutex_exit(&xnbp->xnb_state_lock);

                break;

        default:
                break;
        }
}

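/*
 * Module linkage.  xnb is a misc module; it supplies the common
 * backend machinery used by the flavour drivers rather than
 * attaching to devices itself.
 */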
static struct modldrv modldrv = {
        &mod_miscops, "xnb",
};

static struct modlinkage modlinkage = {
        MODREV_1, &modldrv, NULL
};

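/*
 * Loadable module entry points.
 */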
int
_init(void)
{
        int i;

        mutex_init(&xnb_alloc_page_lock, NULL, MUTEX_DRIVER, NULL);

        i = mod_install(&modlinkage);
        if (i != DDI_SUCCESS)
                mutex_destroy(&xnb_alloc_page_lock);

        return (i);
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        int i;

        i = mod_remove(&modlinkage);
        if (i == DDI_SUCCESS)
                mutex_destroy(&xnb_alloc_page_lock);

        return (i);
}