root/usr/src/uts/common/inet/ip/ip_ndp.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2019, Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/dlpi.h>
#include <sys/socket.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/zone.h>
#include <sys/ethernet.h>
#include <sys/sdt.h>
#include <sys/mac.h>

#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ipclassifier.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_rts.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/sctp_ip.h>
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

#define ANNOUNCE_INTERVAL(isv6) \
        (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
        ipst->ips_ip_arp_publish_interval)

#define DEFENSE_INTERVAL(isv6) \
        (isv6 ? ipst->ips_ndp_defend_interval : \
        ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 */
#define IS_IPV4_LL_SPACE(ptr)   (((uchar_t *)ptr)[0] == 169 && \
                                ((uchar_t *)ptr)[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define NCE_EXTERNAL_FLAGS_MASK \
        (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
        NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
        NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Lock ordering:
 *
 *      ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
 */

static  void    nce_cleanup_list(ncec_t *ncec);
static  void    nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static  ncec_t  *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static  nce_t   *nce_lookup_addr(ill_t *, const in6_addr_t *);
static  int     nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static void     ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
static void nce_inactive(nce_t *);
extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int  nce_add_v6_postprocess(nce_t *);
static int  nce_add_v4_postprocess(nce_t *);
static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
static clock_t nce_fuzz_interval(clock_t, boolean_t);
static void nce_resolv_ipmp_ok(ncec_t *);
static void nce_walk_common(ill_t *, pfi_t, void *);
static void nce_start_timer(ncec_t *, uint_t);
static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
static void nce_fastpath_trigger(nce_t *);
static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);

#ifdef DEBUG
static void     ncec_trace_cleanup(const ncec_t *);
#endif

#define NCE_HASH_PTR_V4(ipst, addr)                                     \
        (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define NCE_HASH_PTR_V6(ipst, addr)                              \
        (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
                NCE_TABLE_SIZE)]))

extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;

/*
 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 * If src_ill is not null, the ncec_addr is bound to src_ill. The
 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 * IPMP cast_ill (in the IPMP case).
 *
 * Note that the probe interval is based on the src_ill for IPv6, and
 * the ncec_xmit_interval for IPv4.
 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
        boolean_t dropped;
        uint32_t probe_interval;

        ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
        ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
        if (ncec->ncec_ipversion == IPV6_VERSION) {
                dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
                    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
                    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
                probe_interval = ILL_PROBE_INTERVAL(src_ill);
        } else {
                /* IPv4 DAD delay the initial probe. */
                if (send_probe)
                        dropped = arp_probe(ncec);
                else
                        dropped = B_TRUE;
                probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
                    !send_probe);
        }
        if (!dropped) {
                mutex_enter(&ncec->ncec_lock);
                ncec->ncec_pcnt--;
                mutex_exit(&ncec->ncec_lock);
        }
        nce_restart_timer(ncec, probe_interval);
}

/*
 * Compute default flags to use for an advertisement of this ncec's address.
 */
static int
nce_advert_flags(const ncec_t *ncec)
{
        int flag = 0;

        if (ncec->ncec_flags & NCE_F_ISROUTER)
                flag |= NDP_ISROUTER;
        if (!(ncec->ncec_flags & NCE_F_ANYCAST))
                flag |= NDP_ORIDE;

        return (flag);
}

/*
 * NDP Cache Entry creation routine.
 * This routine must always be called with ndp6->ndp_g_lock held.
 */
int
nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
        int             err;
        nce_t           *nce;

        ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
        ASSERT(ill != NULL && ill->ill_isv6);

        err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
            &nce);
        if (err != 0)
                return (err);
        ASSERT(newnce != NULL);
        *newnce = nce;
        return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v6(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 */
int
nce_add_v6_postprocess(nce_t *nce)
{
        ncec_t          *ncec = nce->nce_common;
        boolean_t       dropped = B_FALSE;
        uchar_t         *hw_addr = ncec->ncec_lladdr;
        uint_t          hw_addr_len = ncec->ncec_lladdr_length;
        ill_t           *ill = ncec->ncec_ill;
        int             err = 0;
        uint16_t        flags = ncec->ncec_flags;
        ip_stack_t      *ipst = ill->ill_ipst;
        boolean_t       trigger_fastpath = B_TRUE;

        /*
         * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
         * we call nce_fastpath as soon as the ncec is resolved in nce_process.
         * We call nce_fastpath from nce_update if the link layer address of
         * the peer changes from nce_update
         */
        if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
            (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
                trigger_fastpath = B_FALSE;

        if (trigger_fastpath)
                nce_fastpath_trigger(nce);
        if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
                ill_t *hwaddr_ill;
                /*
                 * Unicast entry that needs DAD.
                 */
                if (IS_IPMP(ill)) {
                        hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
                            hw_addr, hw_addr_len);
                } else {
                        hwaddr_ill = ill;
                }
                nce_dad(ncec, hwaddr_ill, B_TRUE);
                err = EINPROGRESS;
        } else if (flags & NCE_F_UNSOL_ADV) {
                /*
                 * We account for the transmit below by assigning one
                 * less than the ndd variable. Subsequent decrements
                 * are done in nce_timer.
                 */
                mutex_enter(&ncec->ncec_lock);
                ncec->ncec_unsolicit_count =
                    ipst->ips_ip_ndp_unsolicit_count - 1;
                mutex_exit(&ncec->ncec_lock);
                dropped = ndp_xmit(ill,
                    ND_NEIGHBOR_ADVERT,
                    hw_addr,
                    hw_addr_len,
                    &ncec->ncec_addr,   /* Source and target of the adv */
                    &ipv6_all_hosts_mcast, /* Destination of the packet */
                    nce_advert_flags(ncec));
                mutex_enter(&ncec->ncec_lock);
                if (dropped)
                        ncec->ncec_unsolicit_count++;
                else
                        ncec->ncec_last_time_defended = ddi_get_lbolt();
                if (ncec->ncec_unsolicit_count != 0) {
                        nce_start_timer(ncec,
                            ipst->ips_ip_ndp_unsolicit_interval);
                }
                mutex_exit(&ncec->ncec_lock);
        }
        return (err);
}

/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v6.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
        int             err = 0;
        ip_stack_t      *ipst = ill->ill_ipst;
        nce_t           *nce, *upper_nce = NULL;
        ill_t           *in_ill = ill;
        boolean_t       need_ill_refrele = B_FALSE;

        if (flags & NCE_F_MCAST) {
                /*
                 * hw_addr will be figured out in nce_set_multicast_v6;
                 * caller has to select the cast_ill
                 */
                ASSERT(hw_addr == NULL);
                ASSERT(!IS_IPMP(ill));
                err = nce_set_multicast_v6(ill, addr, flags, newnce);
                return (err);
        }
        ASSERT(ill->ill_isv6);
        if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
                ill = ipmp_ill_hold_ipmp_ill(ill);
                if (ill == NULL)
                        return (ENXIO);
                need_ill_refrele = B_TRUE;
        }

        mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
        nce = nce_lookup_addr(ill, addr);
        if (nce == NULL) {
                err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
                    &nce);
        } else {
                err = EEXIST;
        }
        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
        if (err == 0)
                err = nce_add_v6_postprocess(nce);
        if (in_ill != ill && nce != NULL) {
                nce_t *under_nce = NULL;

                /*
                 * in_ill was the under_ill. Try to create the under_nce.
                 * Hold the ill_g_lock to prevent changes to group membership
                 * until we are done.
                 */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
                        DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
                            ill_t *, ill);
                        rw_exit(&ipst->ips_ill_g_lock);
                        err = ENXIO;
                        nce_refrele(nce);
                        nce = NULL;
                        goto bail;
                }
                under_nce = nce_fastpath_create(in_ill, nce->nce_common);
                if (under_nce == NULL) {
                        rw_exit(&ipst->ips_ill_g_lock);
                        err = EINVAL;
                        nce_refrele(nce);
                        nce = NULL;
                        goto bail;
                }
                rw_exit(&ipst->ips_ill_g_lock);
                upper_nce = nce;
                nce = under_nce; /* will be returned to caller */
                if (NCE_ISREACHABLE(nce->nce_common))
                        nce_fastpath_trigger(under_nce);
        }
        /* nce_refrele is deferred until the lock is dropped  */
        if (nce != NULL) {
                if (newnce != NULL)
                        *newnce = nce;
                else
                        nce_refrele(nce);
        }
bail:
        if (upper_nce != NULL)
                nce_refrele(upper_nce);
        if (need_ill_refrele)
                ill_refrele(ill);
        return (err);
}

/*
 * Remove all the CONDEMNED nces from the appropriate hash table.
 * We create a private list of NCEs, these may have ires pointing
 * to them, so the list will be passed through to clean up dependent
 * ires and only then we can do ncec_refrele() which can make NCE inactive.
 */
static void
nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
{
        ncec_t *ncec1;
        ncec_t **ptpn;

        ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
        ASSERT(ndp->ndp_g_walker == 0);
        for (; ncec; ncec = ncec1) {
                ncec1 = ncec->ncec_next;
                mutex_enter(&ncec->ncec_lock);
                if (NCE_ISCONDEMNED(ncec)) {
                        ptpn = ncec->ncec_ptpn;
                        ncec1 = ncec->ncec_next;
                        if (ncec1 != NULL)
                                ncec1->ncec_ptpn = ptpn;
                        *ptpn = ncec1;
                        ncec->ncec_ptpn = NULL;
                        ncec->ncec_next = NULL;
                        ncec->ncec_next = *free_nce_list;
                        *free_nce_list = ncec;
                }
                mutex_exit(&ncec->ncec_lock);
        }
}

/*
 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 */
void
ncec_delete(ncec_t *ncec)
{
        ncec_t  **ptpn;
        ncec_t  *ncec1;
        int     ipversion = ncec->ncec_ipversion;
        ndp_g_t *ndp;
        ip_stack_t      *ipst = ncec->ncec_ipst;

        if (ipversion == IPV4_VERSION)
                ndp = ipst->ips_ndp4;
        else
                ndp = ipst->ips_ndp6;

        /* Serialize deletes */
        mutex_enter(&ncec->ncec_lock);
        if (NCE_ISCONDEMNED(ncec)) {
                /* Some other thread is doing the delete */
                mutex_exit(&ncec->ncec_lock);
                return;
        }
        /*
         * Caller has a refhold. Also 1 ref for being in the list. Thus
         * refcnt has to be >= 2
         */
        ASSERT(ncec->ncec_refcnt >= 2);
        ncec->ncec_flags |= NCE_F_CONDEMNED;
        mutex_exit(&ncec->ncec_lock);

        /* Count how many condemned ires for kmem_cache callback */
        atomic_inc_32(&ipst->ips_num_nce_condemned);
        nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

        /* Complete any waiting callbacks */
        ncec_cb_dispatch(ncec);

        /*
         * Cancel any running timer. Timeout can't be restarted
         * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
         * Passing invalid timeout id is fine.
         */
        if (ncec->ncec_timeout_id != 0) {
                (void) untimeout(ncec->ncec_timeout_id);
                ncec->ncec_timeout_id = 0;
        }

        mutex_enter(&ndp->ndp_g_lock);
        if (ncec->ncec_ptpn == NULL) {
                /*
                 * The last ndp walker has already removed this ncec from
                 * the list after we marked the ncec CONDEMNED and before
                 * we grabbed the global lock.
                 */
                mutex_exit(&ndp->ndp_g_lock);
                return;
        }
        if (ndp->ndp_g_walker > 0) {
                /*
                 * Can't unlink. The walker will clean up
                 */
                ndp->ndp_g_walker_cleanup = B_TRUE;
                mutex_exit(&ndp->ndp_g_lock);
                return;
        }

        /*
         * Now remove the ncec from the list. nce_restart_timer won't restart
         * the timer since it is marked CONDEMNED.
         */
        ptpn = ncec->ncec_ptpn;
        ncec1 = ncec->ncec_next;
        if (ncec1 != NULL)
                ncec1->ncec_ptpn = ptpn;
        *ptpn = ncec1;
        ncec->ncec_ptpn = NULL;
        ncec->ncec_next = NULL;
        mutex_exit(&ndp->ndp_g_lock);

        /* Removed from ncec_ptpn/ncec_next list */
        ncec_refrele_notr(ncec);
}

void
ncec_inactive(ncec_t *ncec)
{
        mblk_t          **mpp;
        ill_t           *ill = ncec->ncec_ill;
        ip_stack_t      *ipst = ncec->ncec_ipst;

        ASSERT(ncec->ncec_refcnt == 0);
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        /* Count how many condemned nces for kmem_cache callback */
        if (NCE_ISCONDEMNED(ncec))
                atomic_add_32(&ipst->ips_num_nce_condemned, -1);

        /* Free all allocated messages */
        mpp = &ncec->ncec_qd_mp;
        while (*mpp != NULL) {
                mblk_t  *mp;

                mp = *mpp;
                *mpp = mp->b_next;

                inet_freemsg(mp);
        }
        /*
         * must have been cleaned up in ncec_delete
         */
        ASSERT(list_is_empty(&ncec->ncec_cb));
        list_destroy(&ncec->ncec_cb);
        /*
         * free the ncec_lladdr if one was allocated in nce_add_common()
         */
        if (ncec->ncec_lladdr_length > 0)
                kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
        ncec_trace_cleanup(ncec);
#endif

        mutex_enter(&ill->ill_lock);
        DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
            (char *), "ncec", (void *), ncec);
        ill->ill_ncec_cnt--;
        ncec->ncec_ill = NULL;
        /*
         * If the number of ncec's associated with this ill have dropped
         * to zero, check whether we need to restart any operation that
         * is waiting for this to happen.
         */
        if (ILL_DOWN_OK(ill)) {
                /* ipif_ill_refrele_tail drops the ill_lock */
                ipif_ill_refrele_tail(ill);
        } else {
                mutex_exit(&ill->ill_lock);
        }

        mutex_destroy(&ncec->ncec_lock);
        kmem_cache_free(ncec_cache, ncec);
}

/*
 * ncec_walk routine.  Delete the ncec if it is associated with the ill
 * that is going away.  Always called as a writer.
 */
void
ncec_delete_per_ill(ncec_t *ncec, void *arg)
{
        if ((ncec != NULL) && ncec->ncec_ill == arg) {
                ncec_delete(ncec);
        }
}

/*
 * Neighbor Cache cleanup logic for a list of ncec_t entries.
 */
static void
nce_cleanup_list(ncec_t *ncec)
{
        ncec_t *ncec_next;

        ASSERT(ncec != NULL);
        while (ncec != NULL) {
                ncec_next = ncec->ncec_next;
                ncec->ncec_next = NULL;

                /*
                 * It is possible for the last ndp walker (this thread)
                 * to come here after ncec_delete has marked the ncec CONDEMNED
                 * and before it has removed the ncec from the fastpath list
                 * or called untimeout. So we need to do it here. It is safe
                 * for both ncec_delete and this thread to do it twice or
                 * even simultaneously since each of the threads has a
                 * reference on the ncec.
                 */
                nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
                /*
                 * Cancel any running timer. Timeout can't be restarted
                 * since CONDEMNED is set. The ncec_lock can't be
                 * held across untimeout though passing invalid timeout
                 * id is fine.
                 */
                if (ncec->ncec_timeout_id != 0) {
                        (void) untimeout(ncec->ncec_timeout_id);
                        ncec->ncec_timeout_id = 0;
                }
                /* Removed from ncec_ptpn/ncec_next list */
                ncec_refrele_notr(ncec);
                ncec = ncec_next;
        }
}

/*
 * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
 */
boolean_t
nce_restart_dad(ncec_t *ncec)
{
        boolean_t started;
        ill_t *ill, *hwaddr_ill;

        if (ncec == NULL)
                return (B_FALSE);
        ill = ncec->ncec_ill;
        mutex_enter(&ncec->ncec_lock);
        if (ncec->ncec_state == ND_PROBE) {
                mutex_exit(&ncec->ncec_lock);
                started = B_TRUE;
        } else if (ncec->ncec_state == ND_REACHABLE) {
                ASSERT(ncec->ncec_lladdr != NULL);
                ncec->ncec_state = ND_PROBE;
                ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
                /*
                 * Slight cheat here: we don't use the initial probe delay
                 * for IPv4 in this obscure case.
                 */
                mutex_exit(&ncec->ncec_lock);
                if (IS_IPMP(ill)) {
                        hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
                            ncec->ncec_lladdr, ncec->ncec_lladdr_length);
                } else {
                        hwaddr_ill = ill;
                }
                nce_dad(ncec, hwaddr_ill, B_TRUE);
                started = B_TRUE;
        } else {
                mutex_exit(&ncec->ncec_lock);
                started = B_FALSE;
        }
        return (started);
}

/*
 * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 * If one is found, the refcnt on the ncec will be incremented.
 */
ncec_t *
ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
{
        ncec_t          *ncec;
        ip_stack_t      *ipst = ill->ill_ipst;

        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        mutex_enter(&ipst->ips_ndp6->ndp_g_lock);

        /* Get head of v6 hash table */
        ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
        ncec = ncec_lookup_illgrp(ill, addr, ncec);
        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
        rw_exit(&ipst->ips_ill_g_lock);
        return (ncec);
}
/*
 * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
 * If one is found, the refcnt on the ncec will be incremented.
 */
ncec_t *
ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
{
        ncec_t  *ncec = NULL;
        in6_addr_t addr6;
        ip_stack_t *ipst = ill->ill_ipst;

        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        mutex_enter(&ipst->ips_ndp4->ndp_g_lock);

        /* Get head of v4 hash table */
        ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
        IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
        ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
        mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
        rw_exit(&ipst->ips_ill_g_lock);
        return (ncec);
}

/*
 * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 * If an ncec is found, increment the hold count on that ncec.
 * The caller passes in the start of the appropriate hash table, and must
 * be holding the appropriate global lock (ndp_g_lock). In addition, since
 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 * must be held as reader.
 *
 * This function always matches across the ipmp group.
 */
ncec_t *
ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
{
        ndp_g_t         *ndp;
        ip_stack_t      *ipst = ill->ill_ipst;

        if (ill->ill_isv6)
                ndp = ipst->ips_ndp6;
        else
                ndp = ipst->ips_ndp4;

        ASSERT(ill != NULL);
        ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
        if (IN6_IS_ADDR_UNSPECIFIED(addr))
                return (NULL);
        for (; ncec != NULL; ncec = ncec->ncec_next) {
                if (ncec->ncec_ill == ill ||
                    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
                        if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
                                mutex_enter(&ncec->ncec_lock);
                                if (!NCE_ISCONDEMNED(ncec)) {
                                        ncec_refhold_locked(ncec);
                                        mutex_exit(&ncec->ncec_lock);
                                        break;
                                }
                                mutex_exit(&ncec->ncec_lock);
                        }
                }
        }
        return (ncec);
}

/*
 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v4 will never try to match across the group.
 */
nce_t *
nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
{
        nce_t *nce;
        in6_addr_t addr6;
        ip_stack_t *ipst = ill->ill_ipst;

        mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
        IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
        nce = nce_lookup_addr(ill, &addr6);
        mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
        return (nce);
}

/*
 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v6 will never try to match across the group.
 */
nce_t *
nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
{
        nce_t *nce;
        ip_stack_t *ipst = ill->ill_ipst;

        mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
        nce = nce_lookup_addr(ill, addr6);
        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
        return (nce);
}

static nce_t *
nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
        nce_t *nce;

        ASSERT(ill != NULL);
#ifdef DEBUG
        if (ill->ill_isv6)
                ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
        else
                ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
#endif
        mutex_enter(&ill->ill_lock);
        nce = nce_lookup(ill, addr);
        mutex_exit(&ill->ill_lock);
        return (nce);
}


/*
 * Router turned to host.  We need to make sure that cached copies of the ncec
 * are not used for forwarding packets if they were derived from the default
 * route, and that the default route itself is removed, as  required by
 * section 7.2.5 of RFC 2461.
 *
 * Note that the ncec itself probably has valid link-layer information for the
 * nexthop, so that there is no reason to delete the ncec, as long as the
 * ISROUTER flag is turned off.
 */
static void
ncec_router_to_host(ncec_t *ncec)
{
        ire_t           *ire;
        ip_stack_t      *ipst = ncec->ncec_ipst;

        mutex_enter(&ncec->ncec_lock);
        ncec->ncec_flags &= ~NCE_F_ISROUTER;
        mutex_exit(&ncec->ncec_lock);

        ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
            &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
            MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
        if (ire != NULL) {
                ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
                ire_delete(ire);
                ire_refrele(ire);
        }
}

/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.
 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
        ill_t   *ill = ncec->ncec_ill;
        uint32_t hw_addr_len = ill->ill_phys_addr_length;
        boolean_t ll_updated = B_FALSE;
        boolean_t ll_changed;
        nce_t   *nce;

        ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
        /*
         * No updates of link layer address or the neighbor state is
         * allowed, when the cache is in NONUD state.  This still
         * allows for responding to reachability solicitation.
         */
        mutex_enter(&ncec->ncec_lock);
        if (ncec->ncec_state == ND_INCOMPLETE) {
                if (hw_addr == NULL) {
                        mutex_exit(&ncec->ncec_lock);
                        return;
                }
                nce_set_ll(ncec, hw_addr);
                /*
                 * Update ncec state and send the queued packets
                 * back to ip this time ire will be added.
                 */
                if (flag & ND_NA_FLAG_SOLICITED) {
                        nce_update(ncec, ND_REACHABLE, NULL);
                } else {
                        nce_update(ncec, ND_STALE, NULL);
                }
                mutex_exit(&ncec->ncec_lock);
                nce = nce_fastpath(ncec, B_TRUE, NULL);
                nce_resolv_ok(ncec);
                if (nce != NULL)
                        nce_refrele(nce);
                return;
        }
        ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
        if (!is_adv) {
                /* If this is a SOLICITATION request only */
                if (ll_changed)
                        nce_update(ncec, ND_STALE, hw_addr);
                mutex_exit(&ncec->ncec_lock);
                ncec_cb_dispatch(ncec);
                return;
        }
        if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
                /* If in any other state than REACHABLE, ignore */
                if (ncec->ncec_state == ND_REACHABLE) {
                        nce_update(ncec, ND_STALE, NULL);
                }
                mutex_exit(&ncec->ncec_lock);
                ncec_cb_dispatch(ncec);
                return;
        } else {
                if (ll_changed) {
                        nce_update(ncec, ND_UNCHANGED, hw_addr);
                        ll_updated = B_TRUE;
                }
                if (flag & ND_NA_FLAG_SOLICITED) {
                        nce_update(ncec, ND_REACHABLE, NULL);
                } else {
                        if (ll_updated) {
                                nce_update(ncec, ND_STALE, NULL);
                        }
                }
                mutex_exit(&ncec->ncec_lock);
                if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
                    NCE_F_ISROUTER)) {
                        ncec_router_to_host(ncec);
                } else {
                        ncec_cb_dispatch(ncec);
                }
        }
}

/*
 * Pass arg1 to the cbf supplied, along with each ncec in existence.
 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 * walking the hash list.
 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
    void *arg1, boolean_t trace)
{
        ncec_t  *ncec;
        ncec_t  *ncec1;
        ncec_t  **ncep;
        ncec_t  *free_nce_list = NULL;

        mutex_enter(&ndp->ndp_g_lock);
        /* Prevent ncec_delete from unlink and free of NCE */
        ndp->ndp_g_walker++;
        mutex_exit(&ndp->ndp_g_lock);
        for (ncep = ndp->nce_hash_tbl;
            ncep < A_END(ndp->nce_hash_tbl); ncep++) {
                for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
                        ncec1 = ncec->ncec_next;
                        if (ill == NULL || ncec->ncec_ill == ill) {
                                if (trace) {
                                        ncec_refhold(ncec);
                                        (*cbf)(ncec, arg1);
                                        ncec_refrele(ncec);
                                } else {
                                        ncec_refhold_notr(ncec);
                                        (*cbf)(ncec, arg1);
                                        ncec_refrele_notr(ncec);
                                }
                        }
                }
        }
        mutex_enter(&ndp->ndp_g_lock);
        ndp->ndp_g_walker--;
        if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
                /* Time to delete condemned entries */
                for (ncep = ndp->nce_hash_tbl;
                    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
                        ncec = *ncep;
                        if (ncec != NULL) {
                                nce_remove(ndp, ncec, &free_nce_list);
                        }
                }
                ndp->ndp_g_walker_cleanup = B_FALSE;
        }

        mutex_exit(&ndp->ndp_g_lock);

        if (free_nce_list != NULL) {
                nce_cleanup_list(free_nce_list);
        }
}

/*
 * Walk everything.
 * Note that ill can be NULL hence can't derive the ipst from it.
 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
        ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
        ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}

/*
 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
 * NCEs, and the number to reclaim if we hit the limit.  Used by
 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
 */

/* Maximum number of multicast NCEs on an ill. */
uint_t ip_max_ill_mcast_nces = 16384;
/*
 * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
 * return an error.  Non-zero means delete so many, and if the number is >=
 * the max above, that means delete them all.
 */
uint_t ip_ill_mcast_reclaim = 256;

/*
 * Encapsulate multicast ill capping in a function, for easier DTrace
 * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
 * list can be NULL, but can only be non-NULL if we successfully reclaimed.
 *
 * NOTE:  This function must be called while holding the ill_lock AND
 * JUST PRIOR to making the insertion into the ill_nce list.
 *
 * We can't release the ones we delete ourselves because the ill_lock is held
 * by the caller. They are, instead, passed back in a list_t for deletion
 * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
 *
 * While this covers nce_t, ncec_t gets done even further down the road.  See
 * nce_graveyard_free() for why.
 */
static boolean_t
nce_too_many_mcast(ill_t *ill, list_t *graveyard)
{
        uint_t reclaim_count, max_count, reclaimed = 0;
        boolean_t too_many;
        nce_t *nce, *deadman;

        ASSERT(graveyard != NULL);
        ASSERT(list_is_empty(graveyard));
        ASSERT(MUTEX_HELD(&ill->ill_lock));

        /*
         * NOTE: Some grinning weirdo may have lowered the global max beyond
         * what this ill currently has.  The behavior in this case will be
         * trim-back just by the reclaim amount for any new ones.
         */
        max_count = ip_max_ill_mcast_nces;
        reclaim_count = min(ip_ill_mcast_reclaim, max_count);

        /* All good? */
        if (ill->ill_mcast_nces < max_count)
                return (B_FALSE);       /* Yes, all good. */

        if (reclaim_count == 0)
                return (B_TRUE);        /* Don't bother - we're stuck. */

        /* We need to reclaim now.  Exploit our held ill_lock. */

        /*
         * Start at the tail and work backwards, new nces are head-inserted,
         * so we'll be reaping the oldest entries.
         */
        nce = list_tail(&ill->ill_nce);
        while (reclaimed < reclaim_count) {
                /* Skip ahead to a multicast NCE. */
                while (nce != NULL &&
                    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
                        nce = list_prev(&ill->ill_nce, nce);
                }
                if (nce == NULL)
                        break;

                /*
                 * NOTE: For now, we just delete the first one(s) we find.
                 * This is not optimal, and may require some inspection of nce
                 * & its ncec to be better.
                 */
                deadman = nce;
                nce = list_prev(&ill->ill_nce, nce);

                /* nce_delete() requires caller holds... */
                nce_refhold(deadman);
                nce_delete(deadman);    /* Bumps down ill_mcast_nces. */

                /* Link the dead ones singly, still refheld... */
                list_insert_tail(graveyard, deadman);
                reclaimed++;
        }

        if (reclaimed != reclaim_count) {
                /* We didn't have enough to reach reclaim_count. Why?!? */
                DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
                    uint_t, reclaimed, uint_t, reclaim_count);

                /* In case for some REALLY weird reason we found none! */
                too_many = (reclaimed == 0);
        } else {
                too_many = B_FALSE;
        }

        return (too_many);
}

static void
ncec_mcast_reap_one(ncec_t *ncec, void *arg)
{
        boolean_t reapit;
        ill_t *ill = (ill_t *)arg;

        /* Obvious no-lock-needed checks... */
        if (ncec == NULL || ncec->ncec_ill != ill ||
            (ncec->ncec_flags & NCE_F_MCAST) == 0)
                return;

        mutex_enter(&ncec->ncec_lock);
        /*
         * It's refheld by the walk infrastructure. It has one reference for
         * being in the ndp_g_hash, and if an nce_t exists, that's one more.
         * We want ones without an nce_t, so 2 is the magic number.  If it's
         * LESS than 2, we have much bigger problems anyway.
         */
        ASSERT(ncec->ncec_refcnt >= 2);
        reapit = (ncec->ncec_refcnt == 2);
        mutex_exit(&ncec->ncec_lock);

        if (reapit) {
                IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
                ncec_delete(ncec);
        }
}

/*
 * Attempt to reap stray multicast ncec_t structures left in the wake of
 * nce_graveyard_free(). This is a taskq servicing routine, as it's well
 * outside any netstack-global locks being held - ndp_g_lock in this case.  We
 * have a reference hold on the ill, which will prevent any unplumbing races.
 */
static void
ncec_mcast_reap(void *arg)
{
        ill_t *ill = (ill_t *)arg;

        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
        ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
        mutex_enter(&ill->ill_lock);
        ill->ill_mcast_ncec_cleanup = B_FALSE;
        /*
         * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
         * below for why.
         */
        ill->ill_refcnt--;
        if (ill->ill_refcnt == 0)
                ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
        else
                mutex_exit(&ill->ill_lock);
}

/*
 * Free a list (including handling an empty list or NULL list) of
 * reference-held NCEs that were reaped from a nce_too_many_mcast()
 * call. Separate because the caller must have dropped ndp_g_lock first.
 *
 * This also schedules a taskq task to unlink underlying NCECs from the
 * ndp_g_hash, which are protected by ndp_g_lock.
 */
static void
nce_graveyard_free(list_t *graveyard)
{
        nce_t *deadman, *current;
        ill_t *ill;
        boolean_t doit;

        if (graveyard == NULL)
                return;

        current = list_head(graveyard);
        if (current == NULL) {
                list_destroy(graveyard);
                return;
        }

        ill = current->nce_ill;
        /*
         * Normally one should ill_refhold(ill) here.  There's no _notr()
         * variant like there is for ire_t, dce_t, or even ncec_t, but this is
         * the ONLY case that'll break the mh_trace that IP debugging uses for
         * reference counts (i.e. they assume same thread releases as
         * holds). Instead, we inline ill_refhold() here.  We must do the same
         * in the release done by the ncec_mcast_reap() above.
         */
        mutex_enter(&ill->ill_lock);
        ill->ill_refcnt++;
        mutex_exit(&ill->ill_lock);

        while (current != NULL) {
                ASSERT3P(ill, ==, current->nce_ill);
                deadman = current;
                current = list_next(graveyard, deadman);
                list_remove(graveyard, deadman);
                ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
                    0);
                nce_refrele(deadman);
        }
        list_destroy(graveyard);

        mutex_enter(&ill->ill_lock);
        if (ill->ill_mcast_ncec_cleanup)
                doit = B_FALSE;
        else {
                ill->ill_mcast_ncec_cleanup = B_TRUE;
                doit = B_TRUE;
        }
        mutex_exit(&ill->ill_lock);
        if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
            ill, TQ_NOSLEEP) == TASKQID_INVALID) {
                mutex_enter(&ill->ill_lock);
                if (doit) {
                        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
                        ill->ill_mcast_ncec_cleanup = B_FALSE;
                }
                /* There's no _notr() for ill_refrele(), so inline it here. */
                ill->ill_refcnt--;
                if (ill->ill_refcnt == 0)
                        ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
                else
                        mutex_exit(&ill->ill_lock);
        }
}

/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
 */
static int
nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
        uchar_t         *hw_addr;
        int             err = 0;
        ip_stack_t      *ipst = ill->ill_ipst;
        nce_t           *nce;

        ASSERT(ill != NULL);
        ASSERT(ill->ill_isv6);
        ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

        mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
        nce = nce_lookup_addr(ill, dst);
        if (nce != NULL) {
                mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                goto done;
        }
        if (ill->ill_net_type == IRE_IF_RESOLVER) {
                /*
                 * For IRE_IF_RESOLVER a hardware mapping can be
                 * generated.
                 */
                hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
                if (hw_addr == NULL) {
                        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                        return (ENOMEM);
                }
                ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
        } else {
                /* No hw_addr is needed for IRE_IF_NORESOLVER. */
                hw_addr = NULL;
        }
        ASSERT((flags & NCE_F_MCAST) != 0);
        ASSERT((flags & NCE_F_NONUD) != 0);
        /* nce_state will be computed by nce_add_common() */
        err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
            ND_UNCHANGED, &nce);
        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
        if (err == 0)
                err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
        if (hw_addr != NULL)
                kmem_free(hw_addr, ill->ill_nd_lla_len);
        if (err != 0) {
                ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
                return (err);
        }
done:
        ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
        if (newnce != NULL)
                *newnce = nce;
        else
                nce_refrele(nce);
        return (0);
}

/*
 * Return the link layer address, and any flags of a ncec.
 */
int
ndp_query(ill_t *ill, struct lif_nd_req *lnr)
{
        ncec_t          *ncec;
        in6_addr_t      *addr;
        sin6_t          *sin6;

        ASSERT(ill != NULL && ill->ill_isv6);
        sin6 = (sin6_t *)&lnr->lnr_addr;
        addr =  &sin6->sin6_addr;

        /*
         * NOTE: if the ill is an IPMP interface, then match against the whole
         * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
         * addresses for the data addresses on an IPMP interface even though
         * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
         */
        ncec = ncec_lookup_illgrp_v6(ill, addr);
        if (ncec == NULL)
                return (ESRCH);
        /* If no link layer address is available yet, return ESRCH */
        if (!NCE_ISREACHABLE(ncec)) {
                ncec_refrele(ncec);
                return (ESRCH);
        }
        lnr->lnr_hdw_len = ill->ill_phys_addr_length;
        bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
            lnr->lnr_hdw_len);
        if (ncec->ncec_flags & NCE_F_ISROUTER)
                lnr->lnr_flags = NDF_ISROUTER_ON;
        if (ncec->ncec_flags & NCE_F_ANYCAST)
                lnr->lnr_flags |= NDF_ANYCAST_ON;
        if (ncec->ncec_flags & NCE_F_STATIC)
                lnr->lnr_flags |= NDF_STATIC;
        ncec_refrele(ncec);
        return (0);
}

/*
 * Finish setting up the Enable/Disable multicast for the driver.
 */
mblk_t *
ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
    uint32_t hw_addr_offset, mblk_t *mp)
{
        uchar_t         *hw_addr;
        ipaddr_t        v4group;
        uchar_t         *addr;

        ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
        if (IN6_IS_ADDR_V4MAPPED(v6group)) {
                IN6_V4MAPPED_TO_IPADDR(v6group, v4group);

                ASSERT(CLASSD(v4group));
                ASSERT(!(ill->ill_isv6));

                addr = (uchar_t *)&v4group;
        } else {
                ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
                ASSERT(ill->ill_isv6);

                addr = (uchar_t *)v6group;
        }
        hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
        if (hw_addr == NULL) {
                ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
                freemsg(mp);
                return (NULL);
        }

        ip_mcast_mapping(ill, addr, hw_addr);
        return (mp);
}

void
ip_ndp_resolve(ncec_t *ncec)
{
        in_addr_t       sender4 = INADDR_ANY;
        in6_addr_t      sender6 = ipv6_all_zeros;
        ill_t           *src_ill;
        uint32_t        ms;

        src_ill = nce_resolve_src(ncec, &sender6);
        if (src_ill == NULL) {
                /* Make sure we try again later */
                ms = ncec->ncec_ill->ill_reachable_retrans_time;
                nce_restart_timer(ncec, (clock_t)ms);
                return;
        }
        if (ncec->ncec_ipversion == IPV4_VERSION)
                IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
        mutex_enter(&ncec->ncec_lock);
        if (ncec->ncec_ipversion == IPV6_VERSION)
                ms = ndp_solicit(ncec, sender6, src_ill);
        else
                ms = arp_request(ncec, sender4, src_ill);
        mutex_exit(&ncec->ncec_lock);
        if (ms == 0) {
                if (ncec->ncec_state != ND_REACHABLE) {
                        if (ncec->ncec_ipversion == IPV6_VERSION)
                                ndp_resolv_failed(ncec);
                        else
                                arp_resolv_failed(ncec);
                        ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
                        nce_make_unreachable(ncec);
                        ncec_delete(ncec);
                }
        } else {
                nce_restart_timer(ncec, (clock_t)ms);
        }

        ill_refrele(src_ill);
}

/*
 * Send an IPv6 neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
 * The optional source address is used as a hint to ndp_solicit for
 * which source to use in the packet.
 *
 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
 * the packet.
 */
uint32_t
ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
        in6_addr_t      dst;
        boolean_t       dropped = B_FALSE;

        ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        if (ncec->ncec_rcnt == 0)
                return (0);

        dst = ncec->ncec_addr;
        ncec->ncec_rcnt--;
        mutex_exit(&ncec->ncec_lock);
        dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
            ill->ill_phys_addr_length, &src, &dst, 0);
        mutex_enter(&ncec->ncec_lock);
        if (dropped)
                ncec->ncec_rcnt++;
        return (ncec->ncec_ill->ill_reachable_retrans_time);
}

/*
 * Attempt to recover an address on an interface that's been marked as a
 * duplicate.  Because NCEs are destroyed when the interface goes down, there's
 * no easy way to just probe the address and have the right thing happen if
 * it's no longer in use.  Instead, we just bring it up normally and allow the
 * regular interface start-up logic to probe for a remaining duplicate and take
 * us back down if necessary.
 * Neither DHCP nor temporary addresses arrive here; they're excluded by
 * ip_ndp_excl.
 */
/* ARGSUSED */
void
ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
        ill_t   *ill = rq->q_ptr;
        ipif_t  *ipif;
        in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
        in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
        boolean_t addr_equal;

        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
                /*
                 * We do not support recovery of proxy ARP'd interfaces,
                 * because the system lacks a complete proxy ARP mechanism.
                 */
                if (ill->ill_isv6) {
                        addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
                            addr6);
                } else {
                        addr_equal = (ipif->ipif_lcl_addr == *addr4);
                }

                if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
                        continue;

                /*
                 * If we have already recovered or if the interface is going
                 * away, then ignore.
                 */
                mutex_enter(&ill->ill_lock);
                if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
                    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
                        mutex_exit(&ill->ill_lock);
                        continue;
                }

                ipif->ipif_flags &= ~IPIF_DUPLICATE;
                ill->ill_ipif_dup_count--;
                mutex_exit(&ill->ill_lock);
                ipif->ipif_was_dup = B_TRUE;

                if (ill->ill_isv6) {
                        VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
                        (void) ipif_up_done_v6(ipif);
                } else {
                        VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
                            EINPROGRESS);
                        (void) ipif_up_done(ipif);
                }
        }
        freeb(mp);
}

/*
 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
 * As long as someone else holds the address, the interface will stay down.
 * When that conflict goes away, the interface is brought back up.  This is
 * done so that accidental shutdowns of addresses aren't made permanent.  Your
 * server will recover from a failure.
 *
 * For DHCP and temporary addresses, recovery is not done in the kernel.
 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
void
ipif_dup_recovery(void *arg)
{
        ipif_t *ipif = arg;

        ipif->ipif_recovery_id = 0;
        if (!(ipif->ipif_flags & IPIF_DUPLICATE))
                return;

        /*
         * No lock, because this is just an optimization.
         */
        if (ipif->ipif_state_flags & IPIF_CONDEMNED)
                return;

        /* If the link is down, we'll retry this later */
        if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
                return;

        ipif_do_recovery(ipif);
}

/*
 * Perform interface recovery by forcing the duplicate interfaces up and
 * allowing the system to determine which ones should stay up.
 *
 * Called both by recovery timer expiry and link-up notification.
 */
void
ipif_do_recovery(ipif_t *ipif)
{
        ill_t *ill = ipif->ipif_ill;
        mblk_t *mp;
        ip_stack_t *ipst = ill->ill_ipst;
        size_t mp_size;

        if (ipif->ipif_isv6)
                mp_size = sizeof (ipif->ipif_v6lcl_addr);
        else
                mp_size = sizeof (ipif->ipif_lcl_addr);
        mp = allocb(mp_size, BPRI_MED);
        if (mp == NULL) {
                mutex_enter(&ill->ill_lock);
                if (ipst->ips_ip_dup_recovery > 0 &&
                    ipif->ipif_recovery_id == 0 &&
                    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
                        ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
                            ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
                }
                mutex_exit(&ill->ill_lock);
        } else {
                /*
                 * A recovery timer may still be running if we got here from
                 * ill_restart_dad(); cancel that timer.
                 */
                if (ipif->ipif_recovery_id != 0)
                        (void) untimeout(ipif->ipif_recovery_id);
                ipif->ipif_recovery_id = 0;

                if (ipif->ipif_isv6) {
                        bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
                            sizeof (ipif->ipif_v6lcl_addr));
                } else  {
                        bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
                            sizeof (ipif->ipif_lcl_addr));
                }
                ill_refhold(ill);
                qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
                    B_FALSE);
        }
}

/*
 * Find the MAC and IP addresses in an NA/NS message.
 */
static void
ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
    in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
{
        icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
        nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
        uchar_t *addr;
        int alen;

        /* icmp_inbound_v6 ensures this */
        ASSERT(ira->ira_flags & IRAF_L2SRC_SET);

        addr = ira->ira_l2src;
        alen = ill->ill_phys_addr_length;
        if (alen > 0) {
                *haddr = addr;
                *haddrlenp = alen;
        } else {
                *haddr = NULL;
                *haddrlenp = 0;
        }

        /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
        *targp = ns->nd_ns_target;
}

/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
        ill_t   *ill = rq->q_ptr;
        ipif_t  *ipif;
        uchar_t *haddr;
        uint_t  haddrlen;
        ip_stack_t *ipst = ill->ill_ipst;
        in6_addr_t targ;
        ip_recv_attr_t iras;
        mblk_t  *attrmp;

        attrmp = mp;
        mp = mp->b_cont;
        attrmp->b_cont = NULL;
        if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
                /* The ill or ip_stack_t disappeared on us */
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
                freemsg(mp);
                ira_cleanup(&iras, B_TRUE);
                return;
        }

        ASSERT(ill == iras.ira_rill);

        ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
        if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
                /*
                 * Ignore conflicts generated by misbehaving switches that
                 * just reflect our own messages back to us.  For IPMP, we may
                 * see reflections across any ill in the illgrp.
                 *
                 * RFC2462 and revisions tried to detect both the case
                 * when a statically configured IPv6 address is a duplicate,
                 * and the case when the L2 address itself is a duplicate. The
                 * later is important because, with stateles address autoconf,
                 * if the L2 address is a duplicate, the resulting IPv6
                 * address(es) would also be duplicates. We rely on DAD of the
                 * IPv6 address itself to detect the latter case.
                 */
                /* For an under ill_grp can change under lock */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
                    IS_UNDER_IPMP(ill) &&
                    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
                    haddrlen) != NULL) {
                        rw_exit(&ipst->ips_ill_g_lock);
                        goto ignore_conflict;
                }
                rw_exit(&ipst->ips_ill_g_lock);
        }

        /*
         * Look up the appropriate ipif.
         */
        ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
        if (ipif == NULL)
                goto ignore_conflict;

        /* Reload the ill to match the ipif */
        ill = ipif->ipif_ill;

        /* If it's already duplicate or ineligible, then don't do anything. */
        if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
                ipif_refrele(ipif);
                goto ignore_conflict;
        }

        /*
         * If this is a failure during duplicate recovery, then don't
         * complain.  It may take a long time to recover.
         */
        if (!ipif->ipif_was_dup) {
                char ibuf[LIFNAMSIZ];
                char hbuf[MAC_STR_LEN];
                char sbuf[INET6_ADDRSTRLEN];

                ipif_get_name(ipif, ibuf, sizeof (ibuf));
                cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
                    " disabled", ibuf,
                    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
                    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
        }
        mutex_enter(&ill->ill_lock);
        ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
        ipif->ipif_flags |= IPIF_DUPLICATE;
        ill->ill_ipif_dup_count++;
        mutex_exit(&ill->ill_lock);
        (void) ipif_down(ipif, NULL, NULL);
        (void) ipif_down_tail(ipif);
        mutex_enter(&ill->ill_lock);
        if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
            ill->ill_net_type == IRE_IF_RESOLVER &&
            !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
            ipst->ips_ip_dup_recovery > 0) {
                ASSERT(ipif->ipif_recovery_id == 0);
                ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
                    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
        }
        mutex_exit(&ill->ill_lock);
        ipif_refrele(ipif);

ignore_conflict:
        freemsg(mp);
        ira_cleanup(&iras, B_TRUE);
}

/*
 * Handle failure by tearing down the ipifs with the specified address.  Note
 * that tearing down the ipif also means deleting the ncec through ipif_down, so
 * it's not possible to do recovery by just restarting the ncec timer.  Instead,
 * we start a timer on the ipif.
 * Caller has to free mp;
 */
static void
ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
{
        const uchar_t   *haddr;
        ill_t           *ill = ira->ira_rill;

        /*
         * Ignore conflicts generated by misbehaving switches that just
         * reflect our own messages back to us.
         */

        /* icmp_inbound_v6 ensures this */
        ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
        haddr = ira->ira_l2src;
        if (haddr != NULL &&
            bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
                return;
        }

        if ((mp = copymsg(mp)) != NULL) {
                mblk_t  *attrmp;

                attrmp = ip_recv_attr_to_mblk(ira);
                if (attrmp == NULL) {
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                        ip_drop_input("ipIfStatsInDiscards", mp, ill);
                        freemsg(mp);
                } else {
                        ASSERT(attrmp->b_cont == NULL);
                        attrmp->b_cont = mp;
                        mp = attrmp;
                        ill_refhold(ill);
                        qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
                            B_FALSE);
                }
        }
}

/*
 * Handle a discovered conflict: some other system is advertising that it owns
 * one of our IP addresses.  We need to defend ourselves, or just shut down the
 * interface.
 *
 * Handles both IPv4 and IPv6
 */
boolean_t
ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
{
        ipif_t          *ipif;
        clock_t         now;
        uint_t          maxdefense;
        uint_t          defs;
        ill_t           *ill = ira->ira_ill;
        ip_stack_t      *ipst = ill->ill_ipst;
        uint32_t        elapsed;
        boolean_t       isv6 = ill->ill_isv6;
        ipaddr_t        ncec_addr;

        if (isv6) {
                ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
                    ipst);
        } else {
                if (arp_no_defense) {
                        /*
                         * Yes, there is a conflict, but no, we do not
                         * defend ourself.
                         */
                        return (B_TRUE);
                }
                IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
                ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
                    ipst);
        }
        if (ipif == NULL)
                return (B_FALSE);

        /*
         * First, figure out if this address is disposable.
         */
        if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
                maxdefense = ipst->ips_ip_max_temp_defend;
        else
                maxdefense = ipst->ips_ip_max_defend;

        /*
         * Now figure out how many times we've defended ourselves.  Ignore
         * defenses that happened long in the past.
         */
        now = ddi_get_lbolt();
        elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
        mutex_enter(&ncec->ncec_lock);
        if ((defs = ncec->ncec_defense_count) > 0 &&
            elapsed > ipst->ips_ip_defend_interval) {
                /*
                 * ip_defend_interval has elapsed.
                 * reset the defense count.
                 */
                ncec->ncec_defense_count = defs = 0;
        }
        ncec->ncec_defense_count++;
        ncec->ncec_last_time_defended = now;
        mutex_exit(&ncec->ncec_lock);
        ipif_refrele(ipif);

        /*
         * If we've defended ourselves too many times already, then give up and
         * tear down the interface(s) using this address.
         * Otherwise, caller has to defend by sending out an announce.
         */
        if (defs >= maxdefense) {
                if (isv6)
                        ndp_failure(mp, ira);
                else
                        arp_failure(mp, ira);
        } else {
                return (B_TRUE); /* caller must defend this address */
        }
        return (B_FALSE);
}

/*
 * Handle reception of Neighbor Solicitation messages.
 */
static void
ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
{
        ill_t           *ill = ira->ira_ill, *under_ill;
        nd_neighbor_solicit_t *ns;
        uint32_t        hlen = ill->ill_phys_addr_length;
        uchar_t         *haddr = NULL;
        icmp6_t         *icmp_nd;
        ip6_t           *ip6h;
        ncec_t          *our_ncec = NULL;
        in6_addr_t      target;
        in6_addr_t      src;
        int             len;
        int             flag = 0;
        nd_opt_hdr_t    *opt = NULL;
        boolean_t       bad_solicit = B_FALSE;
        mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
        boolean_t       need_ill_refrele = B_FALSE;

        ip6h = (ip6_t *)mp->b_rptr;
        icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
        len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
        src = ip6h->ip6_src;
        ns = (nd_neighbor_solicit_t *)icmp_nd;
        target = ns->nd_ns_target;
        if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
            IN6_IS_ADDR_LOOPBACK(&target)) {
                if (ip_debug > 2) {
                        /* ip1dbg */
                        pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
                            AF_INET6, &target);
                }
                bad_solicit = B_TRUE;
                goto done;
        }
        if (len > sizeof (nd_neighbor_solicit_t)) {
                /* Options present */
                opt = (nd_opt_hdr_t *)&ns[1];
                len -= sizeof (nd_neighbor_solicit_t);
                if (!ndp_verify_optlen(opt, len)) {
                        ip1dbg(("ndp_input_solicit: Bad opt len\n"));
                        bad_solicit = B_TRUE;
                        goto done;
                }
        }
        if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
                /* Check to see if this is a valid DAD solicitation */
                if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
                        if (ip_debug > 2) {
                                /* ip1dbg */
                                pr_addr_dbg("ndp_input_solicit: IPv6 "
                                    "Destination is not solicited node "
                                    "multicast %s\n", AF_INET6,
                                    &ip6h->ip6_dst);
                        }
                        bad_solicit = B_TRUE;
                        goto done;
                }
        }

        /*
         * NOTE: with IPMP, it's possible the nominated multicast ill (which
         * received this packet if it's multicast) is not the ill tied to
         * e.g. the IPMP ill's data link-local.  So we match across the illgrp
         * to ensure we find the associated NCE.
         */
        our_ncec = ncec_lookup_illgrp_v6(ill, &target);
        /*
         * If this is a valid Solicitation for an address we are publishing,
         * then a PUBLISH entry should exist in the cache
         */
        if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
                ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
                    "ifname=%s ", ill->ill_name));
                if (ip_debug > 2) {
                        /* ip1dbg */
                        pr_addr_dbg(" dst %s\n", AF_INET6, &target);
                }
                if (our_ncec == NULL)
                        bad_solicit = B_TRUE;
                goto done;
        }

        /* At this point we should have a verified NS per spec */
        if (opt != NULL) {
                opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
                if (opt != NULL) {
                        haddr = (uchar_t *)&opt[1];
                        if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
                            hlen == 0) {
                                ip1dbg(("ndp_input_advert: bad SLLA\n"));
                                bad_solicit = B_TRUE;
                                goto done;
                        }
                }
        }

        /* If sending directly to peer, set the unicast flag */
        if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
                flag |= NDP_UNICAST;

        /*
         * Create/update the entry for the soliciting node on the ipmp_ill.
         * or respond to outstanding queries, don't if
         * the source is unspecified address.
         */
        if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
                int     err;
                nce_t   *nnce;

                ASSERT(ill->ill_isv6);
                /*
                 * Regular solicitations *must* include the Source Link-Layer
                 * Address option.  Ignore messages that do not.
                 */
                if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
                        ip1dbg(("ndp_input_solicit: source link-layer address "
                            "option missing with a specified source.\n"));
                        bad_solicit = B_TRUE;
                        goto done;
                }

                /*
                 * This is a regular solicitation.  If we're still in the
                 * process of verifying the address, then don't respond at all
                 * and don't keep track of the sender.
                 */
                if (our_ncec->ncec_state == ND_PROBE)
                        goto done;

                /*
                 * If the solicitation doesn't have sender hardware address
                 * (legal for unicast solicitation), then process without
                 * installing the return NCE.  Either we already know it, or
                 * we'll be forced to look it up when (and if) we reply to the
                 * packet.
                 */
                if (haddr == NULL)
                        goto no_source;

                under_ill = ill;
                if (IS_UNDER_IPMP(under_ill)) {
                        ill = ipmp_ill_hold_ipmp_ill(under_ill);
                        if (ill == NULL)
                                ill = under_ill;
                        else
                                need_ill_refrele = B_TRUE;
                }
                err = nce_lookup_then_add_v6(ill,
                    haddr, hlen,
                    &src,       /* Soliciting nodes address */
                    0,
                    ND_STALE,
                    &nnce);

                if (need_ill_refrele) {
                        ill_refrele(ill);
                        ill = under_ill;
                        need_ill_refrele =  B_FALSE;
                }
                switch (err) {
                case 0:
                        /* done with this entry */
                        nce_refrele(nnce);
                        break;
                case EEXIST:
                        /*
                         * B_FALSE indicates this is not an an advertisement.
                         */
                        nce_process(nnce->nce_common, haddr, 0, B_FALSE);
                        nce_refrele(nnce);
                        break;
                default:
                        ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
                            err));
                        goto done;
                }
no_source:
                flag |= NDP_SOLICITED;
        } else {
                /*
                 * No source link layer address option should be present in a
                 * valid DAD request.
                 */
                if (haddr != NULL) {
                        ip1dbg(("ndp_input_solicit: source link-layer address "
                            "option present with an unspecified source.\n"));
                        bad_solicit = B_TRUE;
                        goto done;
                }
                if (our_ncec->ncec_state == ND_PROBE) {
                        /*
                         * Internally looped-back probes will have
                         * IRAF_L2SRC_LOOPBACK set so we can ignore our own
                         * transmissions.
                         */
                        if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
                                /*
                                 * If someone else is probing our address, then
                                 * we've crossed wires.  Declare failure.
                                 */
                                ndp_failure(mp, ira);
                        }
                        goto done;
                }
                /*
                 * This is a DAD probe.  Multicast the advertisement to the
                 * all-nodes address.
                 */
                src = ipv6_all_hosts_mcast;
        }
        flag |= nce_advert_flags(our_ncec);
        (void) ndp_xmit(ill,
            ND_NEIGHBOR_ADVERT,
            our_ncec->ncec_lladdr,
            our_ncec->ncec_lladdr_length,
            &target,    /* Source and target of the advertisement pkt */
            &src,       /* IP Destination (source of original pkt) */
            flag);
done:
        if (bad_solicit)
                BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
        if (our_ncec != NULL)
                ncec_refrele(our_ncec);
}

/*
 * Handle reception of Neighbor Solicitation messages
 */
void
ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
{
        ill_t           *ill = ira->ira_ill;
        nd_neighbor_advert_t *na;
        uint32_t        hlen = ill->ill_phys_addr_length;
        uchar_t         *haddr = NULL;
        icmp6_t         *icmp_nd;
        ip6_t           *ip6h;
        ncec_t          *dst_ncec = NULL;
        in6_addr_t      target;
        nd_opt_hdr_t    *opt = NULL;
        int             len;
        ip_stack_t      *ipst = ill->ill_ipst;
        mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;

        ip6h = (ip6_t *)mp->b_rptr;
        icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
        len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
        na = (nd_neighbor_advert_t *)icmp_nd;

        if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
            (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
                ip1dbg(("ndp_input_advert: Target is multicast but the "
                    "solicited flag is not zero\n"));
                BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
                return;
        }
        target = na->nd_na_target;
        if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
            IN6_IS_ADDR_LOOPBACK(&target)) {
                if (ip_debug > 2) {
                        /* ip1dbg */
                        pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
                            AF_INET6, &target);
                }
                BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
                return;
        }
        if (len > sizeof (nd_neighbor_advert_t)) {
                opt = (nd_opt_hdr_t *)&na[1];
                if (!ndp_verify_optlen(opt,
                    len - sizeof (nd_neighbor_advert_t))) {
                        ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
                        BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
                        return;
                }
                /* At this point we have a verified NA per spec */
                len -= sizeof (nd_neighbor_advert_t);
                opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
                if (opt != NULL) {
                        haddr = (uchar_t *)&opt[1];
                        if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
                            hlen == 0) {
                                ip1dbg(("ndp_input_advert: bad SLLA\n"));
                                BUMP_MIB(mib,
                                    ipv6IfIcmpInBadNeighborAdvertisements);
                                return;
                        }
                }
        }

        /*
         * NOTE: we match across the illgrp since we need to do DAD for all of
         * our local addresses, and those are spread across all the active
         * ills in the group.
         */
        if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
                return;

        if (NCE_PUBLISH(dst_ncec)) {
                /*
                 * Someone just advertised an addresses that we publish. First,
                 * check it it was us -- if so, we can safely ignore it.
                 * We don't get the haddr from the ira_l2src because, in the
                 * case that the packet originated from us, on an IPMP group,
                 * the ira_l2src may would be the link-layer address of the
                 * cast_ill used to send the packet, which may not be the same
                 * as the dst_ncec->ncec_lladdr of the address.
                 */
                if (haddr != NULL) {
                        if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
                                goto out;

                        if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
                                goto out;   /* from us -- no conflict */

                        /*
                         * If we're in an IPMP group, check if this is an echo
                         * from another ill in the group.  Use the double-
                         * checked locking pattern to avoid grabbing
                         * ill_g_lock in the non-IPMP case.
                         */
                        if (IS_UNDER_IPMP(ill)) {
                                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                                if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
                                    ill->ill_grp, haddr, hlen) != NULL) {
                                        rw_exit(&ipst->ips_ill_g_lock);
                                        goto out;
                                }
                                rw_exit(&ipst->ips_ill_g_lock);
                        }
                }

                /*
                 * This appears to be a real conflict.  If we're trying to
                 * configure this NCE (ND_PROBE), then shut it down.
                 * Otherwise, handle the discovered conflict.
                 */
                if (dst_ncec->ncec_state == ND_PROBE) {
                        ndp_failure(mp, ira);
                } else {
                        if (ip_nce_conflict(mp, ira, dst_ncec)) {
                                char hbuf[MAC_STR_LEN];
                                char sbuf[INET6_ADDRSTRLEN];

                                cmn_err(CE_WARN,
                                    "node '%s' is using %s on %s",
                                    inet_ntop(AF_INET6, &target, sbuf,
                                    sizeof (sbuf)),
                                    haddr == NULL ? "<none>" :
                                    mac_colon_addr(haddr, hlen, hbuf,
                                    sizeof (hbuf)), ill->ill_name);
                                /*
                                 * RFC 4862, Section 5.4.4 does not mandate
                                 * any specific behavior when an NA matches
                                 * a non-tentative address assigned to the
                                 * receiver. We make the choice of defending
                                 * our address, based on the assumption that
                                 * the sender has not detected the Duplicate.
                                 *
                                 * ncec_last_time_defended has been adjusted
                                 * in ip_nce_conflict()
                                 */
                                (void) ndp_announce(dst_ncec);
                        }
                }
        } else {
                if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
                        dst_ncec->ncec_flags |= NCE_F_ISROUTER;

                /* B_TRUE indicates this an advertisement */
                nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
        }
out:
        ncec_refrele(dst_ncec);
}

/*
 * Process NDP neighbor solicitation/advertisement messages.
 * The checksum has already checked o.k before reaching here.
 * Information about the datalink header is contained in ira_l2src, but
 * that should be ignored for loopback packets.
 */
void
ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
        ill_t           *ill = ira->ira_rill;
        icmp6_t         *icmp_nd;
        ip6_t           *ip6h;
        int             len;
        mib2_ipv6IfIcmpEntry_t  *mib = ill->ill_icmp6_mib;
        ill_t           *orig_ill = NULL;

        /*
         * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
         * and make it be the IPMP upper so avoid being confused by a packet
         * addressed to a unicast address on a different ill.
         */
        if (IS_UNDER_IPMP(ill)) {
                orig_ill = ill;
                ill = ipmp_ill_hold_ipmp_ill(orig_ill);
                if (ill == NULL) {
                        ill = orig_ill;
                        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                        ip_drop_input("ipIfStatsInDiscards - IPMP ill",
                            mp, ill);
                        freemsg(mp);
                        return;
                }
                ASSERT(ill != orig_ill);
                orig_ill = ira->ira_ill;
                ira->ira_ill = ill;
                mib = ill->ill_icmp6_mib;
        }
        if (!pullupmsg(mp, -1)) {
                ip1dbg(("ndp_input: pullupmsg failed\n"));
                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
                goto done;
        }
        ip6h = (ip6_t *)mp->b_rptr;
        if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
                ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
                ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
                BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
                goto done;
        }
        /*
         * NDP does not accept any extension headers between the
         * IP header and the ICMP header since e.g. a routing
         * header could be dangerous.
         * This assumes that any AH or ESP headers are removed
         * by ip prior to passing the packet to ndp_input.
         */
        if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
                ip1dbg(("ndp_input: Wrong next header 0x%x\n",
                    ip6h->ip6_nxt));
                ip_drop_input("Wrong next header", mp, ill);
                BUMP_MIB(mib, ipv6IfIcmpInErrors);
                goto done;
        }
        icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
        ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
            icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
        if (icmp_nd->icmp6_code != 0) {
                ip1dbg(("ndp_input: icmp6 code != 0 \n"));
                ip_drop_input("code non-zero", mp, ill);
                BUMP_MIB(mib, ipv6IfIcmpInErrors);
                goto done;
        }
        len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
        /*
         * Make sure packet length is large enough for either
         * a NS or a NA icmp packet.
         */
        if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
                ip1dbg(("ndp_input: packet too short\n"));
                ip_drop_input("packet too short", mp, ill);
                BUMP_MIB(mib, ipv6IfIcmpInErrors);
                goto done;
        }
        if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
                ndp_input_solicit(mp, ira);
        } else {
                ndp_input_advert(mp, ira);
        }
done:
        freemsg(mp);
        if (orig_ill != NULL) {
                ill_refrele(ill);
                ira->ira_ill = orig_ill;
        }
}

/*
 * ndp_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 * It returns B_FALSE only if it does a successful put() to the
 * corresponding ill's ill_wq otherwise returns B_TRUE.
 */
static boolean_t
ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
        uint32_t        len;
        icmp6_t         *icmp6;
        mblk_t          *mp;
        ip6_t           *ip6h;
        nd_opt_hdr_t    *opt;
        uint_t          plen;
        zoneid_t        zoneid = GLOBAL_ZONEID;
        ill_t           *hwaddr_ill = ill;
        ip_xmit_attr_t  ixas;
        ip_stack_t      *ipst = ill->ill_ipst;
        boolean_t       need_refrele = B_FALSE;
        boolean_t       probe = B_FALSE;

        if (IS_UNDER_IPMP(ill)) {
                probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
                /*
                 * We send non-probe packets on the upper IPMP interface.
                 * ip_output_simple() will use cast_ill for sending any
                 * multicast packets. Note that we can't follow the same
                 * logic for probe packets because all interfaces in the ipmp
                 * group may have failed, so that we really want to only try
                 * to send the ND packet on the ill corresponding to the src
                 * address.
                 */
                if (!probe) {
                        ill = ipmp_ill_hold_ipmp_ill(ill);
                        if (ill != NULL)
                                need_refrele = B_TRUE;
                        else
                                ill = hwaddr_ill;
                }
        }

        /*
         * If we have a unspecified source(sender) address, select a
         * proper source address for the solicitation here itself so
         * that we can initialize the h/w address correctly.
         *
         * If the sender is specified then we use this address in order
         * to lookup the zoneid before calling ip_output_v6(). This is to
         * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
         * by IP (we cannot guarantee that the global zone has an interface
         * route to the destination).
         *
         * Note that the NA never comes here with the unspecified source
         * address.
         */

        /*
         * Probes will have unspec src at this point.
         */
        if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
                zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
                /*
                 * It's possible for ipif_lookup_addr_zoneid_v6() to return
                 * ALL_ZONES if it cannot find a matching ipif for the address
                 * we are trying to use. In this case we err on the side of
                 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
                 */
                if (zoneid == ALL_ZONES)
                        zoneid = GLOBAL_ZONEID;
        }

        plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
        len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
        mp = allocb(len,  BPRI_LO);
        if (mp == NULL) {
                if (need_refrele)
                        ill_refrele(ill);
                return (B_TRUE);
        }

        bzero((char *)mp->b_rptr, len);
        mp->b_wptr = mp->b_rptr + len;

        bzero(&ixas, sizeof (ixas));
        ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;

        ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
        ixas.ixa_ipst = ipst;
        ixas.ixa_cred = kcred;
        ixas.ixa_cpid = NOPID;
        ixas.ixa_tsl = NULL;
        ixas.ixa_zoneid = zoneid;

        ip6h = (ip6_t *)mp->b_rptr;
        ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
        ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
        ip6h->ip6_nxt = IPPROTO_ICMPV6;
        ip6h->ip6_hops = IPV6_MAX_HOPS;
        ixas.ixa_multicast_ttl = ip6h->ip6_hops;
        ip6h->ip6_dst = *target;
        icmp6 = (icmp6_t *)&ip6h[1];

        if (hw_addr_len != 0) {
                opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
                    sizeof (nd_neighbor_advert_t));
        } else {
                opt = NULL;
        }
        if (operation == ND_NEIGHBOR_SOLICIT) {
                nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

                if (opt != NULL && !(flag & NDP_PROBE)) {
                        /*
                         * Note that we don't send out SLLA for ND probes
                         * per RFC 4862, even though we do send out the src
                         * haddr for IPv4 DAD probes, even though both IPv4
                         * and IPv6 go out with the unspecified/INADDR_ANY
                         * src IP addr.
                         */
                        opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
                }
                ip6h->ip6_src = *sender;
                ns->nd_ns_target = *target;
                if (!(flag & NDP_UNICAST)) {
                        /* Form multicast address of the target */
                        ip6h->ip6_dst = ipv6_solicited_node_mcast;
                        ip6h->ip6_dst.s6_addr32[3] |=
                            ns->nd_ns_target.s6_addr32[3];
                }
        } else {
                nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

                ASSERT(!(flag & NDP_PROBE));
                if (opt != NULL)
                        opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
                ip6h->ip6_src = *sender;
                na->nd_na_target = *sender;
                if (flag & NDP_ISROUTER)
                        na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
                if (flag & NDP_SOLICITED)
                        na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
                if (flag & NDP_ORIDE)
                        na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
        }

        if (!(flag & NDP_PROBE)) {
                if (hw_addr != NULL && opt != NULL) {
                        /* Fill in link layer address and option len */
                        opt->nd_opt_len = (uint8_t)plen;
                        bcopy(hw_addr, &opt[1], hw_addr_len);
                }
        }
        if (opt != NULL && opt->nd_opt_type == 0) {
                /* If there's no link layer address option, then strip it. */
                len -= plen * 8;
                mp->b_wptr = mp->b_rptr + len;
                ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
        }

        icmp6->icmp6_type = (uint8_t)operation;
        icmp6->icmp6_code = 0;
        /*
         * Prepare for checksum by putting icmp length in the icmp
         * checksum field. The checksum is calculated in ip_output.c.
         */
        icmp6->icmp6_cksum = ip6h->ip6_plen;

        (void) ip_output_simple(mp, &ixas);
        ixa_cleanup(&ixas);
        if (need_refrele)
                ill_refrele(ill);
        return (B_FALSE);
}

/*
 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there
 * is a problem (as opposed to a NCE that was just
 * reclaimed due to lack of memory.
 * Note that static ARP entries never become unreachable.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
        mutex_enter(&ncec->ncec_lock);
        ncec->ncec_state = ND_UNREACHABLE;
        mutex_exit(&ncec->ncec_lock);
}

/*
 * NCE retransmit timer. Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 */
void
nce_timer(void *arg)
{
        ncec_t          *ncec = arg;
        ill_t           *ill = ncec->ncec_ill, *src_ill;
        char            addrbuf[INET6_ADDRSTRLEN];
        boolean_t       dropped = B_FALSE;
        ip_stack_t      *ipst = ncec->ncec_ipst;
        boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
        in_addr_t       sender4 = INADDR_ANY;
        in6_addr_t      sender6 = ipv6_all_zeros;

        /*
         * The timer has to be cancelled by ncec_delete before doing the final
         * refrele. So the NCE is guaranteed to exist when the timer runs
         * until it clears the timeout_id. Before clearing the timeout_id
         * bump up the refcnt so that we can continue to use the ncec
         */
        ASSERT(ncec != NULL);
        mutex_enter(&ncec->ncec_lock);
        ncec_refhold_locked(ncec);
        ncec->ncec_timeout_id = 0;
        mutex_exit(&ncec->ncec_lock);

        src_ill = nce_resolve_src(ncec, &sender6);
        /* if we could not find a sender address, return */
        if (src_ill == NULL) {
                if (!isv6) {
                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
                        ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
                            &sender4, addrbuf, sizeof (addrbuf))));
                } else {
                        ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
                            &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
                }
                nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
                ncec_refrele(ncec);
                return;
        }
        if (!isv6)
                IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

        mutex_enter(&ncec->ncec_lock);
        /*
         * Check the reachability state.
         */
        switch (ncec->ncec_state) {
        case ND_DELAY:
                ASSERT(ncec->ncec_lladdr != NULL);
                ncec->ncec_state = ND_PROBE;
                ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
                if (isv6) {
                        mutex_exit(&ncec->ncec_lock);
                        dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
                            src_ill->ill_phys_addr,
                            src_ill->ill_phys_addr_length,
                            &sender6, &ncec->ncec_addr,
                            NDP_UNICAST);
                } else {
                        dropped = (arp_request(ncec, sender4, src_ill) == 0);
                        mutex_exit(&ncec->ncec_lock);
                }
                if (!dropped) {
                        mutex_enter(&ncec->ncec_lock);
                        ncec->ncec_pcnt--;
                        mutex_exit(&ncec->ncec_lock);
                }
                if (ip_debug > 3) {
                        /* ip2dbg */
                        pr_addr_dbg("nce_timer: state for %s changed "
                            "to PROBE\n", AF_INET6, &ncec->ncec_addr);
                }
                nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
                break;
        case ND_PROBE:
                /* must be retransmit timer */
                ASSERT(ncec->ncec_pcnt >= -1);
                if (ncec->ncec_pcnt > 0) {
                        /*
                         * As per RFC2461, the ncec gets deleted after
                         * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
                         * Note that the first unicast solicitation is sent
                         * during the DELAY state.
                         */
                        ip2dbg(("nce_timer: pcount=%x dst %s\n",
                            ncec->ncec_pcnt,
                            inet_ntop((isv6? AF_INET6 : AF_INET),
                            &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
                        if (NCE_PUBLISH(ncec)) {
                                mutex_exit(&ncec->ncec_lock);
                                /*
                                 * send out a probe; note that src_ill
                                 * is ignored by nce_dad() for all
                                 * DAD message types other than IPv6
                                 * unicast probes
                                 */
                                nce_dad(ncec, src_ill, B_TRUE);
                        } else {
                                ASSERT(src_ill != NULL);
                                if (isv6) {
                                        mutex_exit(&ncec->ncec_lock);
                                        dropped = ndp_xmit(src_ill,
                                            ND_NEIGHBOR_SOLICIT,
                                            src_ill->ill_phys_addr,
                                            src_ill->ill_phys_addr_length,
                                            &sender6, &ncec->ncec_addr,
                                            NDP_UNICAST);
                                } else {
                                        /*
                                         * since the nce is REACHABLE,
                                         * the ARP request will be sent out
                                         * as a link-layer unicast.
                                         */
                                        dropped = (arp_request(ncec, sender4,
                                            src_ill) == 0);
                                        mutex_exit(&ncec->ncec_lock);
                                }
                                if (!dropped) {
                                        mutex_enter(&ncec->ncec_lock);
                                        ncec->ncec_pcnt--;
                                        mutex_exit(&ncec->ncec_lock);
                                }
                                nce_restart_timer(ncec,
                                    ill->ill_reachable_retrans_time);
                        }
                } else if (ncec->ncec_pcnt < 0) {
                        /* No hope, delete the ncec */
                        /* Tell datapath it went bad */
                        ncec->ncec_state = ND_UNREACHABLE;
                        mutex_exit(&ncec->ncec_lock);
                        if (ip_debug > 2) {
                                /* ip1dbg */
                                pr_addr_dbg("nce_timer: Delete NCE for"
                                    " dst %s\n", (isv6? AF_INET6: AF_INET),
                                    &ncec->ncec_addr);
                        }
                        /* if static ARP can't delete. */
                        if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
                                ncec_delete(ncec);

                } else if (!NCE_PUBLISH(ncec)) {
                        /*
                         * Probe count is 0 for a dynamic entry (one that we
                         * ourselves are not publishing). We should never get
                         * here if NONUD was requested, hence the ASSERT below.
                         */
                        ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
                        ip2dbg(("nce_timer: pcount=%x dst %s\n",
                            ncec->ncec_pcnt, inet_ntop(AF_INET6,
                            &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
                        ncec->ncec_pcnt--;
                        mutex_exit(&ncec->ncec_lock);
                        /* Wait one interval before killing */
                        nce_restart_timer(ncec,
                            ill->ill_reachable_retrans_time);
                } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
                        ipif_t *ipif;
                        ipaddr_t ncec_addr;

                        /*
                         * We're done probing, and we can now declare this
                         * address to be usable.  Let IP know that it's ok to
                         * use.
                         */
                        ncec->ncec_state = ND_REACHABLE;
                        ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
                        mutex_exit(&ncec->ncec_lock);
                        if (isv6) {
                                ipif = ipif_lookup_addr_exact_v6(
                                    &ncec->ncec_addr, ill, ipst);
                        } else {
                                IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
                                    ncec_addr);
                                ipif = ipif_lookup_addr_exact(ncec_addr, ill,
                                    ipst);
                        }
                        if (ipif != NULL) {
                                if (ipif->ipif_was_dup) {
                                        char ibuf[LIFNAMSIZ];
                                        char sbuf[INET6_ADDRSTRLEN];

                                        ipif->ipif_was_dup = B_FALSE;
                                        (void) inet_ntop(AF_INET6,
                                            &ipif->ipif_v6lcl_addr,
                                            sbuf, sizeof (sbuf));
                                        ipif_get_name(ipif, ibuf,
                                            sizeof (ibuf));
                                        cmn_err(CE_NOTE, "recovered address "
                                            "%s on %s", sbuf, ibuf);
                                }
                                if ((ipif->ipif_flags & IPIF_UP) &&
                                    !ipif->ipif_addr_ready)
                                        ipif_up_notify(ipif);
                                ipif->ipif_addr_ready = 1;
                                ipif_refrele(ipif);
                        }
                        if (!isv6 && arp_no_defense)
                                break;
                        /* Begin defending our new address */
                        if (ncec->ncec_unsolicit_count > 0) {
                                ncec->ncec_unsolicit_count--;
                                if (isv6) {
                                        dropped = ndp_announce(ncec);
                                } else {
                                        dropped = arp_announce(ncec);
                                }

                                if (dropped)
                                        ncec->ncec_unsolicit_count++;
                                else
                                        ncec->ncec_last_time_defended =
                                            ddi_get_lbolt();
                        }
                        if (ncec->ncec_unsolicit_count > 0) {
                                nce_restart_timer(ncec,
                                    ANNOUNCE_INTERVAL(isv6));
                        } else if (DEFENSE_INTERVAL(isv6) != 0) {
                                nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
                        }
                } else {
                        /*
                         * This is an address we're probing to be our own, but
                         * the ill is down.  Wait until it comes back before
                         * doing anything, but switch to reachable state so
                         * that the restart will work.
                         */
                        ncec->ncec_state = ND_REACHABLE;
                        mutex_exit(&ncec->ncec_lock);
                }
                break;
        case ND_INCOMPLETE: {
                mblk_t  *mp, *nextmp;
                mblk_t  **prevmpp;

                /*
                 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
                 * for any IPMP probe packets, and toss them.  IPMP probe
                 * packets will always be at the head of ncec_qd_mp, so that
                 * we can stop at the first queued ND packet that is
                 * not a probe packet.
                 */
                prevmpp = &ncec->ncec_qd_mp;
                for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
                        nextmp = mp->b_next;

                        if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
                                inet_freemsg(mp);
                                ncec->ncec_nprobes--;
                                *prevmpp = nextmp;
                        } else {
                                prevmpp = &mp->b_next;
                        }
                }

                /*
                 * Must be resolver's retransmit timer.
                 */
                mutex_exit(&ncec->ncec_lock);
                ip_ndp_resolve(ncec);
                break;
        }
        case ND_REACHABLE:
                if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
                    ncec->ncec_unsolicit_count != 0) ||
                    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
                        if (ncec->ncec_unsolicit_count > 0) {
                                ncec->ncec_unsolicit_count--;
                                mutex_exit(&ncec->ncec_lock);
                                /*
                                 * When we get to zero announcements left,
                                 * switch to address defense
                                 */
                        } else {
                                boolean_t rate_limit;

                                mutex_exit(&ncec->ncec_lock);
                                rate_limit = ill_defend_rate_limit(ill, ncec);
                                if (rate_limit) {
                                        nce_restart_timer(ncec,
                                            DEFENSE_INTERVAL(isv6));
                                        break;
                                }
                        }
                        if (isv6) {
                                dropped = ndp_announce(ncec);
                        } else {
                                dropped = arp_announce(ncec);
                        }
                        mutex_enter(&ncec->ncec_lock);
                        if (dropped) {
                                ncec->ncec_unsolicit_count++;
                        } else {
                                ncec->ncec_last_time_defended =
                                    ddi_get_lbolt();
                        }
                        mutex_exit(&ncec->ncec_lock);
                        if (ncec->ncec_unsolicit_count != 0) {
                                nce_restart_timer(ncec,
                                    ANNOUNCE_INTERVAL(isv6));
                        } else {
                                nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
                        }
                } else {
                        mutex_exit(&ncec->ncec_lock);
                }
                break;
        default:
                mutex_exit(&ncec->ncec_lock);
                break;
        }

        ncec_refrele(ncec);
        ill_refrele(src_ill);
}

/*
 * Set a link layer address from the ll_addr passed in.
 * Copy SAP from ill.
 */
static void
nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
{
        ill_t   *ill = ncec->ncec_ill;

        ASSERT(ll_addr != NULL);
        if (ill->ill_phys_addr_length > 0) {
                /*
                 * The bcopy() below used to be called for the physical address
                 * length rather than the link layer address length. For
                 * ethernet and many other media, the phys_addr and lla are
                 * identical.
                 *
                 * The phys_addr and lla may not be the same for devices that
                 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
                 * no known instances of these.
                 *
                 * For PPP or other interfaces with a zero length
                 * physical address, don't do anything here.
                 * The bcopy() with a zero phys_addr length was previously
                 * a no-op for interfaces with a zero-length physical address.
                 * Using the lla for them would change the way they operate.
                 * Doing nothing in such cases preserves expected behavior.
                 */
                bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
        }
}

boolean_t
nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
    uint32_t ll_addr_len)
{
        ASSERT(ncec->ncec_lladdr != NULL);
        if (ll_addr == NULL)
                return (B_FALSE);
        if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
                return (B_TRUE);
        return (B_FALSE);
}

/*
 * Updates the link layer address or the reachability state of
 * a cache entry.  Reset probe counter if needed.
 */
void
nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
{
        ill_t   *ill = ncec->ncec_ill;
        boolean_t need_stop_timer = B_FALSE;
        boolean_t need_fastpath_update = B_FALSE;
        nce_t   *nce = NULL;
        timeout_id_t tid;

        ASSERT(MUTEX_HELD(&ncec->ncec_lock));
        /*
         * If this interface does not do NUD, there is no point
         * in allowing an update to the cache entry.  Although
         * we will respond to NS.
         * The only time we accept an update for a resolver when
         * NUD is turned off is when it has just been created.
         * Non-Resolvers will always be created as REACHABLE.
         */
        if (new_state != ND_UNCHANGED) {
                if ((ncec->ncec_flags & NCE_F_NONUD) &&
                    (ncec->ncec_state != ND_INCOMPLETE))
                        return;
                ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
                ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
                need_stop_timer = B_TRUE;
                if (new_state == ND_REACHABLE)
                        ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
                else {
                        /* We force NUD in this case */
                        ncec->ncec_last = 0;
                }
                ncec->ncec_state = new_state;
                ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
                ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
                    new_state == ND_INCOMPLETE);
        }

        tid = 0;
        if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
                tid = ncec->ncec_timeout_id;
                ncec->ncec_timeout_id = 0;
        }
        /*
         * Re-trigger fastpath probe and
         * overwrite the DL_UNITDATA_REQ data, noting we'll lose
         * whatever packets that happens to be transmitting at the time.
         */
        if (new_ll_addr != NULL) {
                bcopy(new_ll_addr, ncec->ncec_lladdr,
                    ill->ill_phys_addr_length);
                need_fastpath_update = B_TRUE;
        }
        mutex_exit(&ncec->ncec_lock);
        if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
                if (tid != 0)
                        (void) untimeout(tid);
        }
        if (need_fastpath_update) {
                /*
                 * Delete any existing existing dlur_mp and fp_mp information.
                 * For IPMP interfaces, all underlying ill's must be checked
                 * and purged.
                 */
                nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
                /*
                 * add the new dlur_mp and fp_mp
                 */
                nce = nce_fastpath(ncec, B_TRUE, NULL);
                if (nce != NULL)
                        nce_refrele(nce);
        }
        mutex_enter(&ncec->ncec_lock);
}

static void
nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
        uint_t  count = 0;
        mblk_t  **mpp, *tmp;

        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
                if (++count > ncec->ncec_ill->ill_max_buf) {
                        tmp = ncec->ncec_qd_mp->b_next;
                        ncec->ncec_qd_mp->b_next = NULL;
                        /*
                         * if we never create data addrs on the under_ill
                         * does this matter?
                         */
                        BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
                            ipIfStatsOutDiscards);
                        ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
                            ncec->ncec_ill);
                        freemsg(ncec->ncec_qd_mp);
                        ncec->ncec_qd_mp = tmp;
                }
        }

        if (head_insert) {
                ncec->ncec_nprobes++;
                mp->b_next = ncec->ncec_qd_mp;
                ncec->ncec_qd_mp = mp;
        } else {
                *mpp = mp;
        }
}

/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *      (non-ipmp_probe) load-speading case where the source address of the ND
 *      packet is not tied to ncec_ill. If the ill bound to the source address
 *      cannot receive, the response to the ND packet will not be received.
 *      However, if ND packets for ncec_ill's probes are queued behind that ND
 *      packet, those probes will also fail to be sent, and thus in.mpathd will
 *       erroneously conclude that ncec_ill has also failed.
 *
 *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
 *      the first attempt.  This ensures that ND problems do not manifest as
 *      probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));
        nce_queue_mp_common(ncec, mp, head_insert);
}

/*
 * Called when address resolution failed due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
ndp_resolv_failed(ncec_t *ncec)
{
        mblk_t  *mp, *nxt_mp;
        char    buf[INET6_ADDRSTRLEN];
        ill_t *ill = ncec->ncec_ill;
        ip_recv_attr_t  iras;

        bzero(&iras, sizeof (iras));
        iras.ira_flags = 0;
        /*
         * we are setting the ira_rill to the ipmp_ill (instead of
         * the actual ill on which the packet was received), but this
         * is ok because we don't actually need the real ira_rill.
         * to send the icmp unreachable to the sender.
         */
        iras.ira_ill = iras.ira_rill = ill;
        iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
        iras.ira_rifindex = iras.ira_ruifindex;

        ip1dbg(("ndp_resolv_failed: dst %s\n",
            inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
        mutex_enter(&ncec->ncec_lock);
        mp = ncec->ncec_qd_mp;
        ncec->ncec_qd_mp = NULL;
        ncec->ncec_nprobes = 0;
        mutex_exit(&ncec->ncec_lock);
        while (mp != NULL) {
                nxt_mp = mp->b_next;
                mp->b_next = NULL;

                BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
                ip_drop_output("ipIfStatsOutDiscards - address unreachable",
                    mp, ill);
                icmp_unreachable_v6(mp,
                    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
                ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
                mp = nxt_mp;
        }
        ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
}

/*
 * Handle the completion of NDP and ARP resolution.
 */
void
nce_resolv_ok(ncec_t *ncec)
{
        mblk_t *mp;
        uint_t pkt_len;
        iaflags_t ixaflags = IXAF_NO_TRACE;
        nce_t *nce;
        ill_t   *ill = ncec->ncec_ill;
        boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
        ip_stack_t *ipst = ill->ill_ipst;

        if (IS_IPMP(ncec->ncec_ill)) {
                nce_resolv_ipmp_ok(ncec);
                return;
        }
        /* non IPMP case */

        mutex_enter(&ncec->ncec_lock);
        ASSERT(ncec->ncec_nprobes == 0);
        mp = ncec->ncec_qd_mp;
        ncec->ncec_qd_mp = NULL;
        mutex_exit(&ncec->ncec_lock);

        while (mp != NULL) {
                mblk_t *nxt_mp;

                if (ill->ill_isv6) {
                        ip6_t *ip6h = (ip6_t *)mp->b_rptr;

                        pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
                } else {
                        ipha_t *ipha = (ipha_t *)mp->b_rptr;

                        ixaflags |= IXAF_IS_IPV4;
                        pkt_len = ntohs(ipha->ipha_length);
                }
                nxt_mp = mp->b_next;
                mp->b_next = NULL;
                /*
                 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
                 * longer available, but it's ok to drop this flag because TCP
                 * has its own flow-control in effect, so TCP packets
                 * are not likely to get here when flow-control is in effect.
                 */
                mutex_enter(&ill->ill_lock);
                nce = nce_lookup(ill, &ncec->ncec_addr);
                mutex_exit(&ill->ill_lock);

                if (nce == NULL) {
                        if (isv6) {
                                BUMP_MIB(&ipst->ips_ip6_mib,
                                    ipIfStatsOutDiscards);
                        } else {
                                BUMP_MIB(&ipst->ips_ip_mib,
                                    ipIfStatsOutDiscards);
                        }
                        ip_drop_output("ipIfStatsOutDiscards - no nce",
                            mp, NULL);
                        freemsg(mp);
                } else {
                        /*
                         * We don't know the zoneid, but
                         * ip_xmit does not care since IXAF_NO_TRACE
                         * is set. (We traced the packet the first
                         * time through ip_xmit.)
                         */
                        (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
                            ALL_ZONES, 0, NULL);
                        nce_refrele(nce);
                }
                mp = nxt_mp;
        }

        ncec_cb_dispatch(ncec); /* complete callbacks */
}

/*
 * Called by SIOCSNDP* ioctl to add/change an ncec entry
 * and the corresponding attributes.
 * Disallow states other than ND_REACHABLE or ND_STALE.
 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
        sin6_t          *sin6;
        in6_addr_t      *addr;
        ncec_t          *ncec;
        nce_t           *nce;
        int             err = 0;
        uint16_t        new_flags = 0;
        uint16_t        old_flags = 0;
        int             inflags = lnr->lnr_flags;
        ip_stack_t      *ipst = ill->ill_ipst;
        boolean_t       do_postprocess = B_FALSE;

        ASSERT(ill->ill_isv6);
        if ((lnr->lnr_state_create != ND_REACHABLE) &&
            (lnr->lnr_state_create != ND_STALE))
                return (EINVAL);

        sin6 = (sin6_t *)&lnr->lnr_addr;
        addr = &sin6->sin6_addr;

        mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
        ASSERT(!IS_UNDER_IPMP(ill));
        nce = nce_lookup_addr(ill, addr);
        if (nce != NULL)
                new_flags = nce->nce_common->ncec_flags;

        switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
        case NDF_ISROUTER_ON:
                new_flags |= NCE_F_ISROUTER;
                break;
        case NDF_ISROUTER_OFF:
                new_flags &= ~NCE_F_ISROUTER;
                break;
        case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
                mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                if (nce != NULL)
                        nce_refrele(nce);
                return (EINVAL);
        }
        if (inflags & NDF_STATIC)
                new_flags |= NCE_F_STATIC;

        switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
        case NDF_ANYCAST_ON:
                new_flags |= NCE_F_ANYCAST;
                break;
        case NDF_ANYCAST_OFF:
                new_flags &= ~NCE_F_ANYCAST;
                break;
        case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
                mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                if (nce != NULL)
                        nce_refrele(nce);
                return (EINVAL);
        }

        if (nce == NULL) {
                err = nce_add_v6(ill,
                    (uchar_t *)lnr->lnr_hdw_addr,
                    ill->ill_phys_addr_length,
                    addr,
                    new_flags,
                    lnr->lnr_state_create,
                    &nce);
                if (err != 0) {
                        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                        ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
                        return (err);
                } else {
                        do_postprocess = B_TRUE;
                }
        }
        ncec = nce->nce_common;
        old_flags = ncec->ncec_flags;
        if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
                ncec_router_to_host(ncec);
                mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
                if (do_postprocess)
                        err = nce_add_v6_postprocess(nce);
                nce_refrele(nce);
                return (0);
        }
        mutex_exit(&ipst->ips_ndp6->ndp_g_lock);

        if (do_postprocess)
                err = nce_add_v6_postprocess(nce);
        /*
         * err cannot be anything other than 0 because we don't support
         * proxy arp of static addresses.
         */
        ASSERT(err == 0);

        mutex_enter(&ncec->ncec_lock);
        ncec->ncec_flags = new_flags;
        mutex_exit(&ncec->ncec_lock);
        /*
         * Note that we ignore the state at this point, which
         * should be either STALE or REACHABLE.  Instead we let
         * the link layer address passed in to determine the state
         * much like incoming packets.
         */
        nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
        nce_refrele(nce);
        return (0);
}

/*
 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
 * be held to ensure that they are in the same group.
 */
static nce_t *
nce_fastpath_create(ill_t *ill, ncec_t *ncec)
{

        nce_t *nce;

        nce = nce_ill_lookup_then_add(ill, ncec);

        if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
                return (nce);

        /*
         * hold the ncec_lock to synchronize with nce_update() so that,
         * at the end of this function, the contents of nce_dlur_mp are
         * consistent with ncec->ncec_lladdr, even though some intermediate
         * packet may have been sent out with a mangled address, which would
         * only be a transient condition.
         */
        mutex_enter(&ncec->ncec_lock);
        if (ncec->ncec_lladdr != NULL) {
                bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
                    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
        } else {
                nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
                    ill->ill_sap_length);
        }
        mutex_exit(&ncec->ncec_lock);
        return (nce);
}

/*
 * we make nce_fp_mp to have an M_DATA prepend.
 * The caller ensures there is hold on ncec for this function.
 * Note that since ill_fastpath_probe() copies the mblk there is
 * no need to hold the nce or ncec beyond this function.
 *
 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
 * and will be returned back by this function, so that no extra nce_refrele
 * is required for the caller. The calls from nce_add_common() use this
 * method. All other callers (that pass in NULL ncec_nce) will have to do a
 * nce_refrele of the returned nce (when it is non-null).
 */
static nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
        nce_t *nce;
        ill_t *ill = ncec->ncec_ill;

        ASSERT(ill != NULL);

        if (IS_IPMP(ill) && trigger_fp_req) {
                trigger_fp_req = B_FALSE;
                ipmp_ncec_refresh_nce(ncec);
        }

        /*
         * If the caller already has the nce corresponding to the ill, use
         * that one. Otherwise we have to lookup/add the nce. Calls from
         * nce_add_common() fall in the former category, and have just done
         * the nce lookup/add that can be reused.
         */
        if (ncec_nce == NULL)
                nce = nce_fastpath_create(ill, ncec);
        else
                nce = ncec_nce;

        if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
                return (nce);

        if (trigger_fp_req)
                nce_fastpath_trigger(nce);
        return (nce);
}

/*
 * Trigger fastpath on nce. No locks may be held.
 */
static void
nce_fastpath_trigger(nce_t *nce)
{
        int res;
        ill_t *ill = nce->nce_ill;
        ncec_t *ncec = nce->nce_common;

        res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
        /*
         * EAGAIN is an indication of a transient error
         * i.e. allocation failure etc. leave the ncec in the list it
         * will be updated when another probe happens for another ire
         * if not it will be taken out of the list when the ire is
         * deleted.
         */
        if (res != 0 && res != EAGAIN && res != ENOTSUP)
                nce_fastpath_list_delete(ill, ncec, NULL);
}

/*
 * Add ncec to the nce fastpath list on ill.
 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
        nce_t *nce = NULL;

        ASSERT(MUTEX_HELD(&ill->ill_lock));
        /*
         * Atomically ensure that the ill is not CONDEMNED and is not going
         * down, before adding the NCE.
         */
        if (ill->ill_state_flags & ILL_CONDEMNED)
                return (NULL);
        mutex_enter(&ncec->ncec_lock);
        /*
         * if ncec has not been deleted and
         * is not already in the list add it.
         */
        if (!NCE_ISCONDEMNED(ncec)) {
                nce = nce_lookup(ill, &ncec->ncec_addr);
                if (nce != NULL)
                        goto done;
                nce = nce_add(ill, ncec, graveyard);
        }
done:
        mutex_exit(&ncec->ncec_lock);
        return (nce);
}

static nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
        nce_t *nce;
        list_t graveyard;

        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
        mutex_enter(&ill->ill_lock);
        nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
        mutex_exit(&ill->ill_lock);
        nce_graveyard_free(&graveyard);
        return (nce);
}


/*
 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
 * nce is added to the 'dead' list, and the caller must nce_refrele() the
 * entry after all locks have been dropped.
 */
void
nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
        nce_t *nce;

        ASSERT(ill != NULL);

        /* delete any nces referencing the ncec from underlying ills */
        if (IS_IPMP(ill))
                ipmp_ncec_delete_nce(ncec);

        /* now the ill itself */
        mutex_enter(&ill->ill_lock);
        for (nce = list_head(&ill->ill_nce); nce != NULL;
            nce = list_next(&ill->ill_nce, nce)) {
                if (nce->nce_common == ncec) {
                        nce_refhold(nce);
                        nce_delete(nce);
                        break;
                }
        }
        mutex_exit(&ill->ill_lock);
        if (nce != NULL) {
                if (dead == NULL)
                        nce_refrele(nce);
                else
                        list_insert_tail(dead, nce);
        }
}

/*
 * when the fastpath response does not fit in the datab
 * associated with the existing nce_fp_mp, we delete and
 * add the nce to retrigger fastpath based on the information
 * in the ncec_t.
 */
static nce_t *
nce_delete_then_add(nce_t *nce)
{
        ill_t           *ill = nce->nce_ill;
        nce_t           *newnce = NULL;
        list_t          graveyard;

        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
        ip0dbg(("nce_delete_then_add nce %p ill %s\n",
            (void *)nce, ill->ill_name));
        mutex_enter(&ill->ill_lock);
        mutex_enter(&nce->nce_common->ncec_lock);
        nce_delete(nce);
        /*
         * Make sure that ncec is not condemned before adding. We hold the
         * ill_lock and ncec_lock to synchronize with ncec_delete() and
         * ipmp_ncec_delete_nce()
         */
        if (!NCE_ISCONDEMNED(nce->nce_common))
                newnce = nce_add(ill, nce->nce_common, &graveyard);
        mutex_exit(&nce->nce_common->ncec_lock);
        mutex_exit(&ill->ill_lock);
        nce_graveyard_free(&graveyard);
        nce_refrele(nce);
        return (newnce); /* could be null if nomem */
}

typedef struct nce_fp_match_s {
        nce_t   *nce_fp_match_res;
        mblk_t  *nce_fp_match_ack_mp;
} nce_fp_match_t;

/* ARGSUSED */
static int
nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
{
        nce_fp_match_t  *nce_fp_marg = arg;
        ncec_t          *ncec = nce->nce_common;
        mblk_t          *mp = nce_fp_marg->nce_fp_match_ack_mp;
        uchar_t *mp_rptr, *ud_mp_rptr;
        mblk_t          *ud_mp = nce->nce_dlur_mp;
        ptrdiff_t       cmplen;

        /*
         * mp is the mp associated with the fastpath ack.
         * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
         * under consideration. If the contents match, then the
         * fastpath ack is used to update the nce.
         */
        if (ud_mp == NULL)
                return (0);
        mp_rptr = mp->b_rptr;
        cmplen = mp->b_wptr - mp_rptr;
        ASSERT(cmplen >= 0);

        ud_mp_rptr = ud_mp->b_rptr;
        /*
         * The ncec is locked here to prevent any other threads from accessing
         * and changing nce_dlur_mp when the address becomes resolved to an
         * lla while we're in the middle of looking at and comparing the
         * hardware address (lla). It is also locked to prevent multiple
         * threads in nce_fastpath() from examining nce_dlur_mp at the same
         * time.
         */
        mutex_enter(&ncec->ncec_lock);
        if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
            bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
                nce_fp_marg->nce_fp_match_res = nce;
                mutex_exit(&ncec->ncec_lock);
                nce_refhold(nce);
                return (1);
        }
        mutex_exit(&ncec->ncec_lock);
        return (0);
}

/*
 * Update all NCE's that are not in fastpath mode and
 * have an nce_fp_mp that matches mp. mp->b_cont contains
 * the fastpath header.
 *
 * Returns TRUE if entry should be dequeued, or FALSE otherwise.
 */
void
nce_fastpath_update(ill_t *ill,  mblk_t *mp)
{
        nce_fp_match_t nce_fp_marg;
        nce_t *nce;
        mblk_t *nce_fp_mp, *fp_mp;

        nce_fp_marg.nce_fp_match_res = NULL;
        nce_fp_marg.nce_fp_match_ack_mp = mp;

        nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);

        if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
                return;

        mutex_enter(&nce->nce_lock);
        nce_fp_mp = nce->nce_fp_mp;

        if (nce_fp_mp != NULL) {
                fp_mp = mp->b_cont;
                if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
                    nce_fp_mp->b_datap->db_lim) {
                        mutex_exit(&nce->nce_lock);
                        nce = nce_delete_then_add(nce);
                        if (nce == NULL) {
                                return;
                        }
                        mutex_enter(&nce->nce_lock);
                        nce_fp_mp = nce->nce_fp_mp;
                }
        }

        /* Matched - install mp as the fastpath mp */
        if (nce_fp_mp == NULL) {
                fp_mp = dupb(mp->b_cont);
                nce->nce_fp_mp = fp_mp;
        } else {
                fp_mp = mp->b_cont;
                bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
                nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
                    + MBLKL(fp_mp);
        }
        mutex_exit(&nce->nce_lock);
        nce_refrele(nce);
}

/*
 * Return a pointer to a given option in the packet.
 * Assumes that option part of the packet have already been validated.
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
        while (optlen > 0) {
                if (opt->nd_opt_type == opt_type)
                        return (opt);
                optlen -= 8 * opt->nd_opt_len;
                opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
        }
        return (NULL);
}

/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
        ASSERT(opt != NULL);
        while (optlen > 0) {
                if (opt->nd_opt_len == 0)
                        return (B_FALSE);
                optlen -= 8 * opt->nd_opt_len;
                if (optlen < 0)
                        return (B_FALSE);
                opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
        }
        return (B_TRUE);
}

/*
 * ncec_walk function.
 * Free a fraction of the NCE cache entries.
 *
 * A possible optimization here would be to use ncec_last where possible, and
 * delete the least-frequently used entry, which would require more complex
 * computation as we walk through the ncec's (e.g., track ncec entries by
 * order of ncec_last and/or maintain state)
 */
static void
ncec_cache_reclaim(ncec_t *ncec, void *arg)
{
        ip_stack_t      *ipst = ncec->ncec_ipst;
        uint_t          fraction = *(uint_t *)arg;
        uint_t          rand;

        if ((ncec->ncec_flags &
            (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
                return;
        }

        rand = (uint_t)ddi_get_lbolt() +
            NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
        if ((rand/fraction)*fraction == rand) {
                IP_STAT(ipst, ip_nce_reclaim_deleted);
                ncec_delete(ncec);
        }
}

/*
 * kmem_cache callback to free up memory.
 *
 * For now we just delete a fixed fraction.
 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
        uint_t          fraction = ipst->ips_ip_nce_reclaim_fraction;

        IP_STAT(ipst, ip_nce_reclaim_calls);

        ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);

        /*
         * Walk all CONNs that can have a reference on an ire, ncec or dce.
         * Get them to update any stale references to drop any refholds they
         * have.
         */
        ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_nce_reclaim(void *args)
{
        netstack_handle_t nh;
        netstack_t *ns;
        ip_stack_t *ipst;

        netstack_next_init(&nh);
        while ((ns = netstack_next(&nh)) != NULL) {
                /*
                 * netstack_next() can return a netstack_t with a NULL
                 * netstack_ip at boot time.
                 */
                if ((ipst = ns->netstack_ip) == NULL) {
                        netstack_rele(ns);
                        continue;
                }
                ip_nce_reclaim_stack(ipst);
                netstack_rele(ns);
        }
        netstack_next_fini(&nh);
}

#ifdef DEBUG
void
ncec_trace_ref(ncec_t *ncec)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        if (ncec->ncec_trace_disable)
                return;

        if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
                ncec->ncec_trace_disable = B_TRUE;
                ncec_trace_cleanup(ncec);
        }
}

void
ncec_untrace_ref(ncec_t *ncec)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        if (!ncec->ncec_trace_disable)
                th_trace_unref(ncec);
}

static void
ncec_trace_cleanup(const ncec_t *ncec)
{
        th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif

/*
 * Called when address resolution fails due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
arp_resolv_failed(ncec_t *ncec)
{
        mblk_t  *mp, *nxt_mp;
        char    buf[INET6_ADDRSTRLEN];
        struct in_addr ipv4addr;
        ill_t *ill = ncec->ncec_ill;
        ip_stack_t *ipst = ncec->ncec_ipst;
        ip_recv_attr_t  iras;

        bzero(&iras, sizeof (iras));
        iras.ira_flags = IRAF_IS_IPV4;
        /*
         * we are setting the ira_rill to the ipmp_ill (instead of
         * the actual ill on which the packet was received), but this
         * is ok because we don't actually need the real ira_rill.
         * to send the icmp unreachable to the sender.
         */
        iras.ira_ill = iras.ira_rill = ill;
        iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
        iras.ira_rifindex = iras.ira_ruifindex;

        IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
        ip3dbg(("arp_resolv_failed: dst %s\n",
            inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
        mutex_enter(&ncec->ncec_lock);
        mp = ncec->ncec_qd_mp;
        ncec->ncec_qd_mp = NULL;
        ncec->ncec_nprobes = 0;
        mutex_exit(&ncec->ncec_lock);
        while (mp != NULL) {
                nxt_mp = mp->b_next;
                mp->b_next = NULL;

                BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
                ip_drop_output("ipIfStatsOutDiscards - address unreachable",
                    mp, ill);
                if (ipst->ips_ip_arp_icmp_error) {
                        ip3dbg(("arp_resolv_failed: "
                            "Calling icmp_unreachable\n"));
                        icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
                } else {
                        freemsg(mp);
                }
                ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
                mp = nxt_mp;
        }
        ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
}

/*
 * if ill is an under_ill, translate it to the ipmp_ill and add the
 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
 * one on the underlying in_ill) will be created for the
 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
 */
int
nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
        int     err;
        in6_addr_t addr6;
        ip_stack_t *ipst = ill->ill_ipst;
        nce_t   *nce, *upper_nce = NULL;
        ill_t   *in_ill = ill, *under = NULL;
        boolean_t need_ill_refrele = B_FALSE;

        if (flags & NCE_F_MCAST) {
                /*
                 * hw_addr will be figured out in nce_set_multicast_v4;
                 * caller needs to pass in the cast_ill for ipmp
                 */
                ASSERT(hw_addr == NULL);
                ASSERT(!IS_IPMP(ill));
                err = nce_set_multicast_v4(ill, addr, flags, newnce);
                return (err);
        }

        if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
                ill = ipmp_ill_hold_ipmp_ill(ill);
                if (ill == NULL)
                        return (ENXIO);
                need_ill_refrele = B_TRUE;
        }
        if ((flags & NCE_F_BCAST) != 0) {
                /*
                 * IPv4 broadcast ncec: compute the hwaddr.
                 */
                if (IS_IPMP(ill)) {
                        under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
                        if (under == NULL)  {
                                if (need_ill_refrele)
                                        ill_refrele(ill);
                                return (ENETDOWN);
                        }
                        hw_addr = under->ill_bcast_mp->b_rptr +
                            NCE_LL_ADDR_OFFSET(under);
                        hw_addr_len = under->ill_phys_addr_length;
                } else {
                        hw_addr = ill->ill_bcast_mp->b_rptr +
                            NCE_LL_ADDR_OFFSET(ill),
                            hw_addr_len = ill->ill_phys_addr_length;
                }
        }

        mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
        IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
        nce = nce_lookup_addr(ill, &addr6);
        if (nce == NULL) {
                err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
                    state, &nce);
        } else {
                err = EEXIST;
        }
        mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
        if (err == 0)
                err = nce_add_v4_postprocess(nce);

        if (in_ill != ill && nce != NULL) {
                nce_t *under_nce = NULL;

                /*
                 * in_ill was the under_ill. Try to create the under_nce.
                 * Hold the ill_g_lock to prevent changes to group membership
                 * until we are done.
                 */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
                        DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
                            ill_t *, ill);
                        rw_exit(&ipst->ips_ill_g_lock);
                        err = ENXIO;
                        nce_refrele(nce);
                        nce = NULL;
                        goto bail;
                }
                under_nce = nce_fastpath_create(in_ill, nce->nce_common);
                if (under_nce == NULL) {
                        rw_exit(&ipst->ips_ill_g_lock);
                        err = EINVAL;
                        nce_refrele(nce);
                        nce = NULL;
                        goto bail;
                }
                rw_exit(&ipst->ips_ill_g_lock);
                upper_nce = nce;
                nce = under_nce; /* will be returned to caller */
                if (NCE_ISREACHABLE(nce->nce_common))
                        nce_fastpath_trigger(under_nce);
        }
        if (nce != NULL) {
                if (newnce != NULL)
                        *newnce = nce;
                else
                        nce_refrele(nce);
        }
bail:
        if (under != NULL)
                ill_refrele(under);
        if (upper_nce != NULL)
                nce_refrele(upper_nce);
        if (need_ill_refrele)
                ill_refrele(ill);

        return (err);
}

/*
 * NDP Cache Entry creation routine for IPv4.
 * This routine must always be called with ndp4->ndp_g_lock held.
 * Prior to return, ncec_refcnt is incremented.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v4.
 */
int
nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
        int             err;
        boolean_t       is_multicast = (flags & NCE_F_MCAST);
        struct in6_addr addr6;
        nce_t           *nce;

        ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
        ASSERT(!ill->ill_isv6);
        ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);

        IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
        err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
            &nce);
        ASSERT(newnce != NULL);
        *newnce = nce;
        return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v4(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 *
 * Always returns 0, but we return an int to keep this symmetric with the
 * IPv6 counter-part.
 */
int
nce_add_v4_postprocess(nce_t *nce)
{
        ncec_t          *ncec = nce->nce_common;
        uint16_t        flags = ncec->ncec_flags;
        boolean_t       ndp_need_dad = B_FALSE;
        boolean_t       dropped;
        clock_t         delay;
        ip_stack_t      *ipst = ncec->ncec_ill->ill_ipst;
        uchar_t         *hw_addr = ncec->ncec_lladdr;
        boolean_t       trigger_fastpath = B_TRUE;

        /*
         * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
         * we call nce_fastpath as soon as the ncec is resolved in nce_process.
         * We call nce_fastpath from nce_update if the link layer address of
         * the peer changes from nce_update
         */
        if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
            ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
                trigger_fastpath = B_FALSE;

        if (trigger_fastpath)
                nce_fastpath_trigger(nce);

        if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
                /*
                 * Either the caller (by passing in ND_PROBE)
                 * or nce_add_common() (by the internally computed state
                 * based on ncec_addr and ill_net_type) has determined
                 * that this unicast entry needs DAD. Trigger DAD.
                 */
                ndp_need_dad = B_TRUE;
        } else if (flags & NCE_F_UNSOL_ADV) {
                /*
                 * We account for the transmit below by assigning one
                 * less than the ndd variable. Subsequent decrements
                 * are done in nce_timer.
                 */
                mutex_enter(&ncec->ncec_lock);
                ncec->ncec_unsolicit_count =
                    ipst->ips_ip_arp_publish_count - 1;
                mutex_exit(&ncec->ncec_lock);
                dropped = arp_announce(ncec);
                mutex_enter(&ncec->ncec_lock);
                if (dropped)
                        ncec->ncec_unsolicit_count++;
                else
                        ncec->ncec_last_time_defended = ddi_get_lbolt();
                if (ncec->ncec_unsolicit_count != 0) {
                        nce_start_timer(ncec,
                            ipst->ips_ip_arp_publish_interval);
                }
                mutex_exit(&ncec->ncec_lock);
        }

        /*
         * If ncec_xmit_interval is 0, user has configured us to send the first
         * probe right away.  Do so, and set up for the subsequent probes.
         */
        if (ndp_need_dad) {
                mutex_enter(&ncec->ncec_lock);
                if (ncec->ncec_pcnt == 0) {
                        /*
                         * DAD probes and announce can be
                         * administratively disabled by setting the
                         * probe_count to zero. Restart the timer in
                         * this case to mark the ipif as ready.
                         */
                        ncec->ncec_unsolicit_count = 0;
                        mutex_exit(&ncec->ncec_lock);
                        nce_restart_timer(ncec, 0);
                } else {
                        mutex_exit(&ncec->ncec_lock);
                        delay = ((ncec->ncec_flags & NCE_F_FAST) ?
                            ipst->ips_arp_probe_delay :
                            ipst->ips_arp_fastprobe_delay);
                        nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
                }
        }
        return (0);
}

/*
 * ncec_walk routine to update all entries that have a given destination or
 * gateway address and cached link layer (MAC) address.  This is used when ARP
 * informs us that a network-to-link-layer mapping may have changed.
 */
void
nce_update_hw_changed(ncec_t *ncec, void *arg)
{
        nce_hw_map_t *hwm = arg;
        ipaddr_t ncec_addr;

        if (ncec->ncec_state != ND_REACHABLE)
                return;

        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
        if (ncec_addr != hwm->hwm_addr)
                return;

        mutex_enter(&ncec->ncec_lock);
        if (hwm->hwm_flags != 0)
                ncec->ncec_flags = hwm->hwm_flags;
        nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
        mutex_exit(&ncec->ncec_lock);
}

void
ncec_refhold(ncec_t *ncec)
{
        mutex_enter(&(ncec)->ncec_lock);
        (ncec)->ncec_refcnt++;
        ASSERT((ncec)->ncec_refcnt != 0);
#ifdef DEBUG
        ncec_trace_ref(ncec);
#endif
        mutex_exit(&(ncec)->ncec_lock);
}

void
ncec_refhold_notr(ncec_t *ncec)
{
        mutex_enter(&(ncec)->ncec_lock);
        (ncec)->ncec_refcnt++;
        ASSERT((ncec)->ncec_refcnt != 0);
        mutex_exit(&(ncec)->ncec_lock);
}

static void
ncec_refhold_locked(ncec_t *ncec)
{
        ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
        (ncec)->ncec_refcnt++;
#ifdef DEBUG
        ncec_trace_ref(ncec);
#endif
}

/* ncec_inactive destroys the mutex thus no mutex_exit is needed */
void
ncec_refrele(ncec_t *ncec)
{
        mutex_enter(&(ncec)->ncec_lock);
#ifdef DEBUG
        ncec_untrace_ref(ncec);
#endif
        ASSERT((ncec)->ncec_refcnt != 0);
        if (--(ncec)->ncec_refcnt == 0) {
                ncec_inactive(ncec);
        } else {
                mutex_exit(&(ncec)->ncec_lock);
        }
}

void
ncec_refrele_notr(ncec_t *ncec)
{
        mutex_enter(&(ncec)->ncec_lock);
        ASSERT((ncec)->ncec_refcnt != 0);
        if (--(ncec)->ncec_refcnt == 0) {
                ncec_inactive(ncec);
        } else {
                mutex_exit(&(ncec)->ncec_lock);
        }
}

/*
 * Common to IPv4 and IPv6.
 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
        timeout_id_t tid;

        ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

        /* First cancel any running timer */
        mutex_enter(&ncec->ncec_lock);
        tid = ncec->ncec_timeout_id;
        ncec->ncec_timeout_id = 0;
        if (tid != 0) {
                mutex_exit(&ncec->ncec_lock);
                (void) untimeout(tid);
                mutex_enter(&ncec->ncec_lock);
        }

        /* Restart timer */
        nce_start_timer(ncec, ms);
        mutex_exit(&ncec->ncec_lock);
}

static void
nce_start_timer(ncec_t *ncec, uint_t ms)
{
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));
        /*
         * Don't start the timer if the ncec has been deleted, or if the timer
         * is already running
         */
        if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
                ncec->ncec_timeout_id = timeout(nce_timer, ncec,
                    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
        }
}

int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
        uchar_t         *hw_addr;
        int             err = 0;
        ip_stack_t      *ipst = ill->ill_ipst;
        in6_addr_t      dst6;
        nce_t           *nce;

        ASSERT(!ill->ill_isv6);

        IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
        mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
        if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
                mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
                goto done;
        }
        if (ill->ill_net_type == IRE_IF_RESOLVER) {
                /*
                 * For IRE_IF_RESOLVER a hardware mapping can be
                 * generated, for IRE_IF_NORESOLVER, resolution cookie
                 * in the ill is copied in nce_add_v4().
                 */
                hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
                if (hw_addr == NULL) {
                        mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
                        return (ENOMEM);
                }
                ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
        } else {
                /*
                 * IRE_IF_NORESOLVER type simply copies the resolution
                 * cookie passed in.  So no hw_addr is needed.
                 */
                hw_addr = NULL;
        }
        ASSERT(flags & NCE_F_MCAST);
        ASSERT(flags & NCE_F_NONUD);
        /* nce_state will be computed by nce_add_common() */
        err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
            ND_UNCHANGED, &nce);
        mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
        if (err == 0)
                err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
        if (hw_addr != NULL)
                kmem_free(hw_addr, ill->ill_phys_addr_length);
        if (err != 0) {
                ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
                return (err);
        }
done:
        if (newnce != NULL)
                *newnce = nce;
        else
                nce_refrele(nce);
        return (0);
}

/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define NCE_RESCHED_LIST_LEN    8

typedef struct {
        ill_t   *ncert_ill;
        uint_t  ncert_num;
        ncec_t  *ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;

/*
 * Pick the longest waiting NCEs for defense.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
        nce_resched_t *ncert = arg;
        ncec_t **ncecs;
        ncec_t **ncec_max;
        ncec_t *ncec_temp;
        ncec_t *ncec = nce->nce_common;

        ASSERT(ncec->ncec_ill == ncert->ncert_ill);
        /*
         * Only reachable entries that are ready for announcement are eligible.
         */
        if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
                return (0);
        if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
                ncec_refhold(ncec);
                ncert->ncert_nces[ncert->ncert_num++] = ncec;
        } else {
                ncecs = ncert->ncert_nces;
                ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
                ncec_refhold(ncec);
                for (; ncecs < ncec_max; ncecs++) {
                        ASSERT(ncec != NULL);
                        if ((*ncecs)->ncec_last_time_defended >
                            ncec->ncec_last_time_defended) {
                                ncec_temp = *ncecs;
                                *ncecs = ncec;
                                ncec = ncec_temp;
                        }
                }
                ncec_refrele(ncec);
        }
        return (0);
}

/*
 * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized.  (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
        ncec_t          *ncec;
        ip_stack_t      *ipst = ill->ill_ipst;
        uint_t          i, defend_rate;

        i = ill->ill_defend_count;
        ill->ill_defend_count = 0;
        if (ill->ill_isv6)
                defend_rate = ipst->ips_ndp_defend_rate;
        else
                defend_rate = ipst->ips_arp_defend_rate;
        /* If none could be sitting around, then don't reschedule */
        if (i < defend_rate) {
                DTRACE_PROBE1(reschedule_none, ill_t *, ill);
                return;
        }
        ncert->ncert_ill = ill;
        while (ill->ill_defend_count < defend_rate) {
                nce_walk_common(ill, ncec_reschedule, ncert);
                for (i = 0; i < ncert->ncert_num; i++) {

                        ncec = ncert->ncert_nces[i];
                        mutex_enter(&ncec->ncec_lock);
                        ncec->ncec_flags |= NCE_F_DELAYED;
                        mutex_exit(&ncec->ncec_lock);
                        /*
                         * we plan to schedule this ncec, so incr the
                         * defend_count in anticipation.
                         */
                        if (++ill->ill_defend_count >= defend_rate)
                                break;
                }
                if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
                        break;
        }
}

/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defense_perio' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
        clock_t         now = ddi_get_lbolt();
        ip_stack_t      *ipst = ill->ill_ipst;
        clock_t         start = ill->ill_defend_start;
        uint32_t        elapsed, defend_period, defend_rate;
        nce_resched_t   ncert;
        boolean_t       ret;
        int             i;

        if (ill->ill_isv6) {
                defend_period = ipst->ips_ndp_defend_period;
                defend_rate = ipst->ips_ndp_defend_rate;
        } else {
                defend_period = ipst->ips_arp_defend_period;
                defend_rate = ipst->ips_arp_defend_rate;
        }
        if (defend_rate == 0)
                return (B_TRUE);
        bzero(&ncert, sizeof (ncert));
        mutex_enter(&ill->ill_lock);
        if (start > 0) {
                elapsed = now - start;
                if (elapsed > SEC_TO_TICK(defend_period)) {
                        ill->ill_defend_start = now;
                        /*
                         * nce_ill_reschedule will attempt to
                         * prevent starvation by reschduling the
                         * oldest entries, which are marked with
                         * the NCE_F_DELAYED flag.
                         */
                        nce_ill_reschedule(ill, &ncert);
                }
        } else {
                ill->ill_defend_start = now;
        }
        ASSERT(ill->ill_defend_count <= defend_rate);
        mutex_enter(&ncec->ncec_lock);
        if (ncec->ncec_flags & NCE_F_DELAYED) {
                /*
                 * This ncec was rescheduled as one of the really old
                 * entries needing on-going defense. The
                 * ill_defend_count was already incremented in
                 * nce_ill_reschedule. Go ahead and send the announce.
                 */
                ncec->ncec_flags &= ~NCE_F_DELAYED;
                mutex_exit(&ncec->ncec_lock);
                ret = B_FALSE;
                goto done;
        }
        mutex_exit(&ncec->ncec_lock);
        if (ill->ill_defend_count < defend_rate)
                ill->ill_defend_count++;
        if (ill->ill_defend_count == defend_rate) {
                /*
                 * we are no longer allowed to send unbidden defense
                 * messages. Wait for rescheduling.
                 */
                ret = B_TRUE;
        } else {
                ret = B_FALSE;
        }
done:
        mutex_exit(&ill->ill_lock);
        /*
         * After all the locks have been dropped we can restart nce timer,
         * and refrele the delayed ncecs
         */
        for (i = 0; i < ncert.ncert_num; i++) {
                clock_t xmit_interval;
                ncec_t  *tmp;

                tmp = ncert.ncert_nces[i];
                xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
                    B_FALSE);
                nce_restart_timer(tmp, xmit_interval);
                ncec_refrele(tmp);
        }
        return (ret);
}

boolean_t
ndp_announce(ncec_t *ncec)
{
        return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
            ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
            nce_advert_flags(ncec)));
}

ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
        mblk_t          *mp;
        in6_addr_t      src6;
        ipaddr_t        src4;
        ill_t           *ill = ncec->ncec_ill;
        ill_t           *src_ill = NULL;
        ipif_t          *ipif = NULL;
        boolean_t       is_myaddr = NCE_MYADDR(ncec);
        boolean_t       isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

        ASSERT(src != NULL);
        ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
        src4 = 0;
        src6 = *src;
        if (is_myaddr) {
                src6 = ncec->ncec_addr;
                if (!isv6)
                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
        } else {
                /*
                 * try to find one from the outgoing packet.
                 */
                mutex_enter(&ncec->ncec_lock);
                mp = ncec->ncec_qd_mp;
                if (mp != NULL) {
                        if (isv6) {
                                ip6_t   *ip6h = (ip6_t *)mp->b_rptr;

                                src6 = ip6h->ip6_src;
                        } else {
                                ipha_t  *ipha = (ipha_t *)mp->b_rptr;

                                src4 = ipha->ipha_src;
                                IN6_IPADDR_TO_V4MAPPED(src4, &src6);
                        }
                }
                mutex_exit(&ncec->ncec_lock);
        }

        /*
         * For outgoing packets, if the src of outgoing packet is one
         * of the assigned interface addresses use it, otherwise we
         * will pick the source address below.
         * For local addresses (is_myaddr) doing DAD, NDP announce
         * messages are mcast. So we use the (IPMP) cast_ill or the
         * (non-IPMP) ncec_ill for these message types. The only case
         * of unicast DAD messages are for IPv6 ND probes, for which
         * we find the ipif_bound_ill corresponding to the ncec_addr.
         */
        if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
                if (isv6) {
                        ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
                            ill->ill_ipst);
                } else {
                        ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
                            ill->ill_ipst);
                }

                /*
                 * If no relevant ipif can be found, then it's not one of our
                 * addresses.  Reset to :: and try to find a src for the NS or
                 * ARP request using ipif_select_source_v[4,6]  below.
                 * If an ipif can be found, but it's not yet done with
                 * DAD verification, and we are not being invoked for
                 * DAD (i.e., !is_myaddr), then just postpone this
                 * transmission until later.
                 */
                if (ipif == NULL) {
                        src6 = ipv6_all_zeros;
                        src4 = INADDR_ANY;
                } else if (!ipif->ipif_addr_ready && !is_myaddr) {
                        DTRACE_PROBE2(nce__resolve__ipif__not__ready,
                            ncec_t *, ncec, ipif_t *, ipif);
                        ipif_refrele(ipif);
                        return (NULL);
                }
        }

        if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
                /*
                 * Pick a source address for this solicitation, but
                 * restrict the selection to addresses assigned to the
                 * output interface.  We do this because the destination will
                 * create a neighbor cache entry for the source address of
                 * this packet, so the source address had better be a valid
                 * neighbor.
                 */
                if (isv6) {
                        ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
                            B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
                            B_FALSE, NULL);
                } else {
                        ipaddr_t nce_addr;

                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
                        ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
                            B_FALSE, NULL);
                }
                if (ipif == NULL && IS_IPMP(ill)) {
                        ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

                        if (send_ill != NULL) {
                                if (isv6) {
                                        ipif = ipif_select_source_v6(send_ill,
                                            &ncec->ncec_addr, B_TRUE,
                                            IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
                                            B_FALSE, NULL);
                                } else {
                                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
                                            src4);
                                        ipif = ipif_select_source_v4(send_ill,
                                            src4, ALL_ZONES, B_TRUE, NULL);
                                }
                                ill_refrele(send_ill);
                        }
                }

                if (ipif == NULL) {
                        char buf[INET6_ADDRSTRLEN];

                        ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
                            inet_ntop((isv6 ? AF_INET6 : AF_INET),
                            (char *)&ncec->ncec_addr, buf, sizeof (buf))));
                        DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
                        return (NULL);
                }
                src6 = ipif->ipif_v6lcl_addr;
        }
        *src = src6;
        if (ipif != NULL) {
                src_ill = ipif->ipif_ill;
                if (IS_IPMP(src_ill))
                        src_ill = ipmp_ipif_hold_bound_ill(ipif);
                else
                        ill_refhold(src_ill);
                ipif_refrele(ipif);
                DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
                    ill_t *, src_ill);
        }
        return (src_ill);
}

void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
        ill_t   *ill;
        ncec_t  *ncec;
        nce_t   *nce;
        uint16_t new_state;

        ill = (ipif ? ipif->ipif_ill : NULL);
        if (ill != NULL) {
                /*
                 * only one ncec is possible
                 */
                nce = nce_lookup_v4(ill, addr);
                if (nce != NULL) {
                        ncec = nce->nce_common;
                        mutex_enter(&ncec->ncec_lock);
                        if (NCE_ISREACHABLE(ncec))
                                new_state = ND_UNCHANGED;
                        else
                                new_state = ND_STALE;
                        ncec->ncec_flags = flags;
                        nce_update(ncec, new_state, hwaddr);
                        mutex_exit(&ncec->ncec_lock);
                        nce_refrele(nce);
                        return;
                }
        } else {
                /*
                 * ill is wildcard; clean up all ncec's and ire's
                 * that match on addr.
                 */
                nce_hw_map_t hwm;

                hwm.hwm_addr = *addr;
                hwm.hwm_hwlen = hwaddr_len;
                hwm.hwm_hwaddr = hwaddr;
                hwm.hwm_flags = flags;

                ncec_walk_common(ipst->ips_ndp4, NULL,
                    nce_update_hw_changed, &hwm, B_TRUE);
        }
}

/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
 */
static int
nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
{
        static  ncec_t          nce_nil;
        uchar_t                 *template = NULL;
        int                     err;
        ncec_t                  *ncec;
        ncec_t                  **ncep;
        ip_stack_t              *ipst = ill->ill_ipst;
        uint16_t                state;
        boolean_t               fastprobe = B_FALSE;
        struct ndp_g_s          *ndp;
        nce_t                   *nce = NULL;
        list_t                  graveyard;
        mblk_t                  *dlur_mp = NULL;

        if (ill->ill_isv6)
                ndp = ill->ill_ipst->ips_ndp6;
        else
                ndp = ill->ill_ipst->ips_ndp4;

        *retnce = NULL;
        state = 0;

        ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));

        if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
                ip0dbg(("nce_add_common: no addr\n"));
                return (EINVAL);
        }
        if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
                ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
                return (EINVAL);
        }

        if (ill->ill_isv6) {
                ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
        } else {
                ipaddr_t v4addr;

                IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
                ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
        }

        /*
         * The caller has ensured that there is no nce on ill, but there could
         * still be an nce_common_t for the address, so that we find exisiting
         * ncec_t strucutures first, and atomically add a new nce_t if
         * one is found. The ndp_g_lock ensures that we don't cross threads
         * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
         * compare for matches across the illgrp because this function is
         * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
         * with the nce_lookup_then_add_v* passing in the ipmp_ill where
         * appropriate.
         */
        ncec = *ncep;
        for (; ncec != NULL; ncec = ncec->ncec_next) {
                if (ncec->ncec_ill == ill) {
                        if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
                                /*
                                 * We should never find *retnce to be
                                 * MYADDR, since the caller may then
                                 * incorrectly restart a DAD timer that's
                                 * already running.  However, if we are in
                                 * forwarding mode, and the interface is
                                 * moving in/out of groups, the data
                                 * path ire lookup (e.g., ire_revalidate_nce)
                                 * may  have determined that some destination
                                 * is offlink while the control path is adding
                                 * that address as a local address.
                                 * Recover from  this case by failing the
                                 * lookup
                                 */
                                if (NCE_MYADDR(ncec))
                                        return (ENXIO);
                                *retnce = nce_ill_lookup_then_add(ill, ncec);
                                if (*retnce != NULL)
                                        break;
                        }
                }
        }
        if (*retnce != NULL) /* caller must trigger fastpath on nce */
                return (0);

        ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
        if (ncec == NULL)
                return (ENOMEM);
        *ncec = nce_nil;
        ncec->ncec_ill = ill;
        ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
        ncec->ncec_flags = flags;
        ncec->ncec_ipst = ipst; /* No netstack_hold */

        if (!ill->ill_isv6) {
                ipaddr_t addr4;

                /*
                 * DAD probe interval and probe count are set based on
                 * fast/slow probe settings. If the underlying link doesn't
                 * have reliably up/down notifications or if we're working
                 * with IPv4 169.254.0.0/16 Link Local Address space, then
                 * don't use the fast timers.  Otherwise, use them.
                 */
                ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
                IN6_V4MAPPED_TO_IPADDR(addr, addr4);
                if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
                        fastprobe = B_TRUE;
                } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
                    !IS_IPV4_LL_SPACE(&addr4)) {
                        ill_t *hwaddr_ill;

                        hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
                            hw_addr_len);
                        if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
                                fastprobe = B_TRUE;
                }
                if (fastprobe) {
                        ncec->ncec_xmit_interval =
                            ipst->ips_arp_fastprobe_interval;
                        ncec->ncec_pcnt =
                            ipst->ips_arp_fastprobe_count;
                        ncec->ncec_flags |= NCE_F_FAST;
                } else {
                        ncec->ncec_xmit_interval =
                            ipst->ips_arp_probe_interval;
                        ncec->ncec_pcnt =
                            ipst->ips_arp_probe_count;
                }
                if (NCE_PUBLISH(ncec)) {
                        ncec->ncec_unsolicit_count =
                            ipst->ips_ip_arp_publish_count;
                }
        } else {
                /*
                 * probe interval is constant: ILL_PROBE_INTERVAL
                 * probe count is constant: ND_MAX_UNICAST_SOLICIT
                 */
                ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
                if (NCE_PUBLISH(ncec)) {
                        ncec->ncec_unsolicit_count =
                            ipst->ips_ip_ndp_unsolicit_count;
                }
        }
        ncec->ncec_rcnt = ill->ill_xmit_count;
        ncec->ncec_addr = *addr;
        ncec->ncec_qd_mp = NULL;
        ncec->ncec_refcnt = 1; /* for ncec getting created */
        mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
        ncec->ncec_trace_disable = B_FALSE;

        /*
         * ncec_lladdr holds link layer address
         */
        if (hw_addr_len > 0) {
                template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
                if (template == NULL) {
                        err = ENOMEM;
                        goto err_ret;
                }
                ncec->ncec_lladdr = template;
                ncec->ncec_lladdr_length = hw_addr_len;
                bzero(ncec->ncec_lladdr, hw_addr_len);
        }
        if ((flags & NCE_F_BCAST) != 0) {
                state = ND_REACHABLE;
                ASSERT(hw_addr_len > 0);
        } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
                state = ND_INITIAL;
        } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
                /*
                 * NORESOLVER entries are always created in the REACHABLE
                 * state.
                 */
                state = ND_REACHABLE;
                if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
                    ill->ill_mactype != DL_IPV4 &&
                    ill->ill_mactype != DL_6TO4) {
                        /*
                         * We create a nce_res_mp with the IP nexthop address
                         * as the destination address if the physical length
                         * is exactly 4 bytes for point-to-multipoint links
                         * that do their own resolution from IP to link-layer
                         * address (e.g. IP over X.25).
                         */
                        bcopy((uchar_t *)addr,
                            ncec->ncec_lladdr, ill->ill_phys_addr_length);
                }
                if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
                    ill->ill_mactype != DL_IPV6) {
                        /*
                         * We create a nce_res_mp with the IP nexthop address
                         * as the destination address if the physical legnth
                         * is exactly 16 bytes for point-to-multipoint links
                         * that do their own resolution from IP to link-layer
                         * address.
                         */
                        bcopy((uchar_t *)addr,
                            ncec->ncec_lladdr, ill->ill_phys_addr_length);
                }
                /*
                 * Since NUD is not part of the base IPv4 protocol definition,
                 * IPv4 neighbor entries on NORESOLVER interfaces will never
                 * age, and are marked NCE_F_NONUD.
                 */
                if (!ill->ill_isv6)
                        ncec->ncec_flags |= NCE_F_NONUD;
        } else if (ill->ill_net_type == IRE_LOOPBACK) {
                state = ND_REACHABLE;
        }

        if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
                /*
                 * We are adding an ncec with a deterministic hw_addr,
                 * so the state can only be one of {REACHABLE, STALE, PROBE}.
                 *
                 * if we are adding a unicast ncec for the local address
                 * it would be REACHABLE; we would be adding a ND_STALE entry
                 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
                 * addresses are added in PROBE to trigger DAD.
                 */
                if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
                    ill->ill_net_type == IRE_IF_NORESOLVER)
                        state = ND_REACHABLE;
                else if (!NCE_PUBLISH(ncec))
                        state = ND_STALE;
                else
                        state = ND_PROBE;
                if (hw_addr != NULL)
                        nce_set_ll(ncec, hw_addr);
        }
        /* caller overrides internally computed state */
        if (nce_state != ND_UNCHANGED)
                state = nce_state;

        if (state == ND_PROBE)
                ncec->ncec_flags |= NCE_F_UNVERIFIED;

        ncec->ncec_state = state;

        if (state == ND_REACHABLE) {
                ncec->ncec_last = ncec->ncec_init_time =
                    TICK_TO_MSEC(ddi_get_lbolt64());
        } else {
                ncec->ncec_last = 0;
                if (state == ND_INITIAL)
                        ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
        }
        list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
            offsetof(ncec_cb_t, ncec_cb_node));
        /*
         * have all the memory allocations out of the way before taking locks
         * and adding the nce.
         */
        nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
        if (nce == NULL) {
                err = ENOMEM;
                goto err_ret;
        }
        if (ncec->ncec_lladdr != NULL ||
            ill->ill_net_type == IRE_IF_NORESOLVER) {
                dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
                    ill->ill_phys_addr_length, ill->ill_sap,
                    ill->ill_sap_length);
                if (dlur_mp == NULL) {
                        err = ENOMEM;
                        goto err_ret;
                }
        }

        /*
         * Atomically ensure that the ill is not CONDEMNED, before
         * adding the NCE.
         */
        mutex_enter(&ill->ill_lock);
        if (ill->ill_state_flags & ILL_CONDEMNED) {
                mutex_exit(&ill->ill_lock);
                err = EINVAL;
                goto err_ret;
        }
        if (!NCE_MYADDR(ncec) &&
            (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
                mutex_exit(&ill->ill_lock);
                DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
                err = EINVAL;
                goto err_ret;
        }
        /*
         * Acquire the ncec_lock even before adding the ncec to the list
         * so that it cannot get deleted after the ncec is added, but
         * before we add the nce.
         */
        mutex_enter(&ncec->ncec_lock);
        if ((ncec->ncec_next = *ncep) != NULL)
                ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
        *ncep = ncec;
        ncec->ncec_ptpn = ncep;

        /* Bump up the number of ncec's referencing this ill */
        DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
            (char *), "ncec", (void *), ncec);
        ill->ill_ncec_cnt++;
        /*
         * Since we hold the ncec_lock at this time, the ncec cannot be
         * condemned, and we can safely add the nce.
         */
        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
        mutex_exit(&ncec->ncec_lock);
        mutex_exit(&ill->ill_lock);
        nce_graveyard_free(&graveyard);

        /* caller must trigger fastpath on *retnce */
        return (0);

err_ret:
        if (ncec != NULL)
                kmem_cache_free(ncec_cache, ncec);
        if (nce != NULL)
                kmem_cache_free(nce_cache, nce);
        freemsg(dlur_mp);
        if (template != NULL)
                kmem_free(template, ill->ill_phys_addr_length);
        return (err);
}

/*
 * take a ref on the nce
 */
void
nce_refhold(nce_t *nce)
{
        mutex_enter(&nce->nce_lock);
        nce->nce_refcnt++;
        ASSERT((nce)->nce_refcnt != 0);
        mutex_exit(&nce->nce_lock);
}

/*
 * release a ref on the nce; In general, this
 * cannot be called with locks held because nce_inactive
 * may result in nce_inactive which will take the ill_lock,
 * do ipif_ill_refrele_tail etc. Thus the one exception
 * where this can be called with locks held is when the caller
 * is certain that the nce_refcnt is sufficient to prevent
 * the invocation of nce_inactive.
 */
void
nce_refrele(nce_t *nce)
{
        ASSERT((nce)->nce_refcnt != 0);
        mutex_enter(&nce->nce_lock);
        if (--nce->nce_refcnt == 0)
                nce_inactive(nce); /* destroys the mutex */
        else
                mutex_exit(&nce->nce_lock);
}

/*
 * free the nce after all refs have gone away.
 */
static void
nce_inactive(nce_t *nce)
{
        ill_t *ill = nce->nce_ill;

        ASSERT(nce->nce_refcnt == 0);

        ncec_refrele_notr(nce->nce_common);
        nce->nce_common = NULL;
        freemsg(nce->nce_fp_mp);
        freemsg(nce->nce_dlur_mp);

        mutex_enter(&ill->ill_lock);
        DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
            (char *), "nce", (void *), nce);
        ill->ill_nce_cnt--;
        nce->nce_ill = NULL;
        /*
         * If the number of ncec's associated with this ill have dropped
         * to zero, check whether we need to restart any operation that
         * is waiting for this to happen.
         */
        if (ILL_DOWN_OK(ill)) {
                /* ipif_ill_refrele_tail drops the ill_lock */
                ipif_ill_refrele_tail(ill);
        } else {
                mutex_exit(&ill->ill_lock);
        }

        mutex_destroy(&nce->nce_lock);
        kmem_cache_free(nce_cache, nce);
}

/*
 * Add an nce to the ill_nce list.
 *
 * Adding multicast NCEs is subject to a per-ill limit. This function returns
 * NULL if that's the case, and it may reap a number of multicast nces.
 * Callers (and upstack) must be able to cope with NULL returns.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
    list_t *graveyard)
{
        ASSERT(MUTEX_HELD(&ill->ill_lock));

        if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
                if (nce_too_many_mcast(ill, graveyard)) {
                        kmem_cache_free(nce_cache, nce);
                        return (NULL);
                }
                ill->ill_mcast_nces++;
        }

        bzero(nce, sizeof (*nce));
        mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
        nce->nce_common = ncec;
        nce->nce_addr = ncec->ncec_addr;
        nce->nce_ill = ill;
        DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
            (char *), "nce", (void *), nce);
        ill->ill_nce_cnt++;

        nce->nce_refcnt = 1; /* for the thread */
        ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
        nce->nce_dlur_mp = dlur_mp;

        /* add nce to the ill's fastpath list.  */
        nce->nce_refcnt++; /* for the list */
        list_insert_head(&ill->ill_nce, nce);
        return (nce);
}

static nce_t *
nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
        nce_t   *nce;
        mblk_t  *dlur_mp = NULL;

        ASSERT(MUTEX_HELD(&ill->ill_lock));
        ASSERT(MUTEX_HELD(&ncec->ncec_lock));

        nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
        if (nce == NULL)
                return (NULL);
        if (ncec->ncec_lladdr != NULL ||
            ill->ill_net_type == IRE_IF_NORESOLVER) {
                dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
                    ill->ill_phys_addr_length, ill->ill_sap,
                    ill->ill_sap_length);
                if (dlur_mp == NULL) {
                        kmem_cache_free(nce_cache, nce);
                        return (NULL);
                }
        }
        /*
         * If nce_add_impl() returns NULL due to on multicast limiting, caller
         * will (correctly) assume ENOMEM.
         */
        return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
}

/*
 * remove the nce from the ill_faspath list
 */
void
nce_delete(nce_t *nce)
{
        ill_t   *ill = nce->nce_ill;

        ASSERT(MUTEX_HELD(&ill->ill_lock));

        mutex_enter(&nce->nce_lock);
        if (nce->nce_is_condemned) {
                /*
                 * some other thread has removed this nce from the ill_nce list
                 */
                mutex_exit(&nce->nce_lock);
                return;
        }
        nce->nce_is_condemned = B_TRUE;
        mutex_exit(&nce->nce_lock);

        /* Update the count of multicast NCEs. */
        if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
                ill->ill_mcast_nces--;

        list_remove(&ill->ill_nce, nce);
        /*
         * even though we are holding the ill_lock, it is ok to
         * call nce_refrele here because we know that we should have
         * at least 2 refs on the nce: one for the thread, and one
         * for the list. The refrele below will release the one for
         * the list.
         */
        nce_refrele(nce);
}

nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
        nce_t *nce = NULL;

        ASSERT(ill != NULL);
        ASSERT(MUTEX_HELD(&ill->ill_lock));

        for (nce = list_head(&ill->ill_nce); nce != NULL;
            nce = list_next(&ill->ill_nce, nce)) {
                if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
                        break;
        }

        /*
         * if we found the nce on the ill_nce list while holding
         * the ill_lock, then it cannot be condemned yet.
         */
        if (nce != NULL) {
                ASSERT(!nce->nce_is_condemned);
                nce_refhold(nce);
        }
        return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
        nce_t *nce = NULL, *nce_next;

        ASSERT(MUTEX_HELD(&ill->ill_lock));
        for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
                nce_next = list_next(&ill->ill_nce, nce);
                if (func(ill, nce, arg) != 0)
                        break;
                nce = nce_next;
        }
}

void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
        mutex_enter(&ill->ill_lock);
        nce_walk_common(ill, func, arg);
        mutex_exit(&ill->ill_lock);
}

void
nce_flush(ill_t *ill, boolean_t flushall)
{
        nce_t *nce, *nce_next;
        list_t dead;

        list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
        mutex_enter(&ill->ill_lock);
        for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
                nce_next = list_next(&ill->ill_nce, nce);
                if (!flushall && NCE_PUBLISH(nce->nce_common)) {
                        nce = nce_next;
                        continue;
                }
                /*
                 * nce_delete requires that the caller should either not
                 * be holding locks, or should hold a ref to ensure that
                 * we wont hit ncec_inactive. So take a ref and clean up
                 * after the list is flushed.
                 */
                nce_refhold(nce);
                nce_delete(nce);
                list_insert_tail(&dead, nce);
                nce = nce_next;
        }
        mutex_exit(&ill->ill_lock);
        while ((nce = list_head(&dead)) != NULL) {
                list_remove(&dead, nce);
                nce_refrele(nce);
        }
        ASSERT(list_is_empty(&dead));
        list_destroy(&dead);
}

/* Return an interval that is anywhere in the [1 .. intv] range */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
        clock_t rnd, frac;

        (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
        /* Note that clock_t is signed; must chop off bits */
        rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
        if (initial_time) {
                if (intv <= 0)
                        intv = 1;
                else
                        intv = (rnd % intv) + 1;
        } else {
                /* Compute 'frac' as 20% of the configured interval */
                if ((frac = intv / 5) <= 1)
                        frac = 2;
                /* Set intv randomly in the range [intv-frac .. intv+frac] */
                if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
                        intv = 1;
        }
        return (intv);
}

void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
        mblk_t *mp;
        uint_t pkt_len;
        iaflags_t ixaflags = IXAF_NO_TRACE;
        nce_t *under_nce;
        ill_t   *ill = ncec->ncec_ill;
        boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
        ipif_t *src_ipif = NULL;
        ip_stack_t *ipst = ill->ill_ipst;
        ill_t *send_ill;
        uint_t nprobes;

        ASSERT(IS_IPMP(ill));

        mutex_enter(&ncec->ncec_lock);
        nprobes = ncec->ncec_nprobes;
        mp = ncec->ncec_qd_mp;
        ncec->ncec_qd_mp = NULL;
        ncec->ncec_nprobes = 0;
        mutex_exit(&ncec->ncec_lock);

        while (mp != NULL) {
                mblk_t *nxt_mp;

                nxt_mp = mp->b_next;
                mp->b_next = NULL;
                if (isv6) {
                        ip6_t *ip6h = (ip6_t *)mp->b_rptr;

                        pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
                        src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
                            ill, ALL_ZONES, ipst);
                } else {
                        ipha_t *ipha = (ipha_t *)mp->b_rptr;

                        ixaflags |= IXAF_IS_IPV4;
                        pkt_len = ntohs(ipha->ipha_length);
                        src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
                            ill, ALL_ZONES, ipst);
                }

                /*
                 * find a new nce based on an under_ill. The first IPMP probe
                 * packet gets queued, so we could still find a src_ipif that
                 * matches an IPMP test address.
                 */
                if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
                        /*
                         * if src_ipif is null, this could be either a
                         * forwarded packet or a probe whose src got deleted.
                         * We identify the former case by looking for the
                         * ncec_nprobes: the first ncec_nprobes packets are
                         * probes;
                         */
                        if (src_ipif == NULL && nprobes > 0)
                                goto drop_pkt;

                        /*
                         * For forwarded packets, we use the ipmp rotor
                         * to find send_ill.
                         */
                        send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
                            B_TRUE);
                } else {
                        send_ill = src_ipif->ipif_ill;
                        ill_refhold(send_ill);
                }

                DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
                    (ncec_t *), ncec, (ipif_t *),
                    src_ipif, (ill_t *), send_ill);

                if (send_ill == NULL) {
                        if (src_ipif != NULL)
                                ipif_refrele(src_ipif);
                        goto drop_pkt;
                }
                /* create an under_nce on send_ill */
                rw_enter(&ipst->ips_ill_g_lock, RW_READER);
                if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
                        under_nce = nce_fastpath_create(send_ill, ncec);
                else
                        under_nce = NULL;
                rw_exit(&ipst->ips_ill_g_lock);
                if (under_nce != NULL && NCE_ISREACHABLE(ncec))
                        nce_fastpath_trigger(under_nce);

                ill_refrele(send_ill);
                if (src_ipif != NULL)
                        ipif_refrele(src_ipif);

                if (under_nce != NULL) {
                        (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
                            ALL_ZONES, 0, NULL);
                        nce_refrele(under_nce);
                        if (nprobes > 0)
                                nprobes--;
                        mp = nxt_mp;
                        continue;
                }
drop_pkt:
                if (isv6) {
                        BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
                } else {
                        BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
                }
                ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
                freemsg(mp);
                if (nprobes > 0)
                        nprobes--;
                mp = nxt_mp;
        }
        ncec_cb_dispatch(ncec); /* complete callbacks */
}